In [9]:
# Read data
import numpy as np
import pandas as pd
# Read Parquet files
train_dataframe = pd.read_parquet(r'E:\Desktop\Text\Task1\train.parquet')
val_dataframe = pd.read_parquet(r'E:\Desktop\Text\Task1\validation.parquet')
test_dataframe = pd.read_parquet(r'E:\Desktop\Text\Task1\test.parquet')

In [10]:
# Data preprocessing and feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download nltk resources
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

# Custom text cleaning and preprocessing class
class TextPreprocessor(TransformerMixin):
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def clean_text(self, text):
        # Remove user tags and URLs
        text = re.sub(r'@\w+', '', text)
        text = re.sub(r'http\S+', '', text)
        text = re.sub(r'www\S+', '', text)
        # Remove HTML tags and special characters
        text = re.sub(r'<[^>]+>', '', text)
        text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)
        # Convert to lowercase
        text = text.lower()
        # Remove stopwords and lemmatize
        text = ' '.join([self.lemmatizer.lemmatize(word) for word in text.split() if word not in self.stop_words])
        return text

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return [self.clean_text(text) for text in X]


# Create data preprocessing pipeline
pipeline = Pipeline([
    ('text_preprocessor', TextPreprocessor()),
    ('tfidf_vectorizer', TfidfVectorizer())
])


train_texts = train_dataframe['text']
val_texts = val_dataframe['text']
test_texts = test_dataframe['text']

# Apply preprocessing
train_features = pipeline.fit_transform(train_texts)
val_features = pipeline.transform(val_texts)
test_features = pipeline.transform(test_texts)


In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
import scipy.sparse as sp

# Retrieve labels from DataFrame
y_train = train_dataframe['label']
y_val = val_dataframe['label']
y_test = test_dataframe['label']

# Merge training and validation sets
X_train_full = sp.vstack([train_features, val_features])
y_train_full = pd.concat([y_train, y_val])



# # Set the grid of parameters to adjust
# param_grid = {
#     # 'C': [0.1, 1, 10,100],  # Inverse of regularization strength    
#     # 'C': [0.5,0.75, 1, 1.25,1.5,1.75,2],  # Inverse of regularization strength
#     'C': [1, 1.25, 1.3, 1.5, 1.6, 1.75, 2],  # Inverse of regularization strength    
#     # 'gamma': ['scale', 'auto'],  # Kernel coefficient (for non-linear kernel)
#     'kernel': ['linear', 'rbf']  # Type of kernel
# }

# # Create SVM model
# svm_model = SVC(random_state=42)

# # Create a grid search object, using cross-validation to evaluate each parameter setting
# grid_search = GridSearchCV(svm_model, param_grid, cv=5, scoring='accuracy', verbose=0)

# # Perform grid search using training data
# grid_search.fit(train_features, y_train)

# # Output the best parameters and corresponding performance
# print("Best parameters:", grid_search.best_params_)
# print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))



# Create SVM model
svm_model = SVC(C = 1.25, kernel='linear', random_state=42)

# Train the model
svm_model.fit(X_train_full, y_train_full)


# Predict on the test set
test_predictions = svm_model.predict(test_features)
test_accuracy = accuracy_score(y_test, test_predictions)
print(f"Test Accuracy: {test_accuracy}")
print()

# Identify misclassified samples
misclassified_indices = np.where(test_predictions != y_test)[0]
misclassified_samples = test_dataframe.iloc[misclassified_indices]
misclassified_texts = misclassified_samples['text']
misclassified_labels = misclassified_samples['label']

# Create an instance of the text preprocessor
text_preprocessor = TextPreprocessor()

# Output the original and preprocessed texts
print("Some examples of misclassified texts:")
print()
for index in misclassified_indices[:10]:  # Display the first ten misclassified samples
    original_text = misclassified_texts.iloc[index]
    processed_text = text_preprocessor.clean_text(original_text)
    print(f"Original Text: {original_text}")
    print(f"Processed Text: {processed_text}")
    print(f"Predicted Label: {test_predictions[index]}, Actual Label: {y_test.iloc[index]}\n")


Test Accuracy: 0.6903589021815623

Some examples of misclassified texts:

Original Text: @user #shocking loss of talented young man#prayers#pray for his family
Processed Text: shocking loss talented young manprayerspray family
Predicted Label: 2, Actual Label: 1

Original Text: @user Really??? I've had to hang up!
Processed Text: really ive hang
Predicted Label: 1, Actual Label: 3

Original Text: yukwon no video do zico the world is shaking
Processed Text: yukwon video zico world shaking
Predicted Label: 0, Actual Label: 3

Original Text: IK to PMLN: 'Darling, I will haunt you in your nightmares, dressed like a dream.' BEST THING EVER. #GameOverNawaz
Processed Text: ik pmln darling haunt nightmare dressed like dream best thing ever gameovernawaz
Predicted Label: 0, Actual Label: 3

Original Text: @user Improve on the makeup dear to avoid reduction in viewership...
Processed Text: improve makeup dear avoid reduction viewership
Predicted Label: 0, Actual Label: 2

Original Text: @user Co

In [12]:
# # Prediction accuracy without preprocessing
# import numpy as np
# import pandas as pd
# train_dataframe = pd.read_parquet(r'E:\Desktop\Text\Task1\train.parquet')
# val_dataframe = pd.read_parquet(r'E:\Desktop\Text\Task1\validation.parquet')
# test_dataframe = pd.read_parquet(r'E:\Desktop\Text\Task1\test.parquet')

# # Create instance of TF-IDF Vectorizer
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.svm import SVC
# from sklearn.metrics import accuracy_score
# import scipy.sparse as sp

# # Set up TF-IDF vectorizer (without preprocessing)
# no_prep_tfidf_vectorizer = TfidfVectorizer()

# # Apply TF-IDF vectorization
# train_features_no_prep = no_prep_tfidf_vectorizer.fit_transform(train_dataframe['text'])
# val_features_no_prep = no_prep_tfidf_vectorizer.transform(val_dataframe['text'])
# test_features_no_prep = no_prep_tfidf_vectorizer.transform(test_dataframe['text'])

# # Merge training and validation sets
# X_train_full_no_prep = sp.vstack([train_features_no_prep, val_features_no_prep])
# y_train_full = pd.concat([train_dataframe['label'], val_dataframe['label']])

# # Create and train SVM model
# svm_model_no_prep = SVC(C=1.25, kernel='linear', random_state=42)
# svm_model_no_prep.fit(X_train_full_no_prep, y_train_full)

# # Predict on the test set
# test_predictions_no_prep = svm_model_no_prep.predict(test_features_no_prep)
# test_accuracy_no_prep = accuracy_score(test_dataframe['label'], test_predictions_no_prep)
# print(f"Test Accuracy without preprocessing: {test_accuracy_no_prep}")
