In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [3]:
from sklearn.metrics import classification_report, accuracy_score

In [14]:
# Load the data
data = pd.read_csv('/content/drive/MyDrive/NLPFinalProject/Model Implementation/balancedtask1reddit.csv')
data.head()

# Preprocess the text data
# For example, using the NLTK library:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

nltk.download('stopwords')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')

def preprocess_text(text):
    if isinstance(text, str):  # Check if text is a string
        tokens = word_tokenize(text.lower())
        tokens = [stemmer.stem(token) for token in tokens if token not in stop_words]
        return ' '.join(tokens)
    else:
        return ''

data['chat1_processed'] = data['chat1'].apply(preprocess_text)
data['chat2_processed'] = data['chat2'].apply(preprocess_text)

# Extract features from the preprocessed text data
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['chat1_processed'] + ' ' + data['chat2_processed'])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, data['label'], test_size=0.2, random_state=42)

# Train a SVM model on the training set
svm = SVC(kernel='linear')
svm.fit(X_train, y_train)

# Make predictions on the test set
y_pred = svm.predict(X_test)

# Evaluate the performance of the model
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('F1 score:', f1_score(y_test, y_pred))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Accuracy: 0.725
Precision: 0.7735849056603774
Recall: 0.6612903225806451
F1 score: 0.7130434782608696


In [13]:
data['label'].value_counts()

True     300
False    300
Name: label, dtype: int64

In [5]:
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Classification Report:
               precision    recall  f1-score   support

       False       0.75      0.82      0.78       213
        True       0.82      0.74      0.78       227

    accuracy                           0.78       440
   macro avg       0.78      0.78      0.78       440
weighted avg       0.78      0.78      0.78       440



In [None]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {'C': [0.1, 1], 
              'gamma': [1, 0.1],
              'kernel': ['linear', 'rbf', 'poly']}

# Run the grid search
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=2)
grid.fit(X_train, y_train)

# Find the best parameters
print(grid.best_params_)

# Train a SVM model on the training set with the best parameters
svm_best = SVC(kernel=grid.best_params_['kernel'], C=grid.best_params_['C'], gamma=grid.best_params_['gamma'])
svm_best.fit(X_train, y_train)

# Make predictions on the test set
y_pred_best = svm_best.predict(X_test)

# Evaluate the performance of the model
print('Accuracy:', accuracy_score(y_test, y_pred_best))
print('Precision:', precision_score(y_test, y_pred_best))
print('Recall:', recall_score(y_test, y_pred_best))
print('F1 score:', f1_score(y_test, y_pred_best))


Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   7.3s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   6.8s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   7.4s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   7.1s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   7.6s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   7.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   7.6s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   7.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   7.7s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   7.1s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   7.8s
[CV] END ........................C=0.1, gamma=1,

In [None]:
print("\nClassification Report:\n", classification_report(y_test, y_pred_best))

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [15]:
# Using Lemmatization instead of Stemming
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

def preprocess_text_lemmatize(text):
    tokens = word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return ' '.join(tokens)

data['chat1_processed'] = data['chat1'].apply(preprocess_text_lemmatize)
data['chat2_processed'] = data['chat2'].apply(preprocess_text_lemmatize)

# Using a more complex model like XGBoost
from xgboost import XGBClassifier

# Extract features from the preprocessed text data
X = vectorizer.fit_transform(data['chat1_processed'] + ' ' + data['chat2_processed'])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, data['label'], test_size=0.2, random_state=42)

# Train a XGBClassifier model on the training set
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

# Make predictions on the test set
y_pred_xgb = xgb.predict(X_test)

# Evaluate the performance of the model
print('Accuracy:', accuracy_score(y_test, y_pred_xgb))
print('Precision:', precision_score(y_test, y_pred_xgb))
print('Recall:', recall_score(y_test, y_pred_xgb))
print('F1 score:', f1_score(y_test, y_pred_xgb))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


AttributeError: ignored

In [None]:
print("\nClassification Report:\n", classification_report(y_test, y_pred_xgb))


In [17]:
# Using Lemmatization instead of Stemming
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

def preprocess_text_lemmatize(text):
    if isinstance(text, str):  # Check if text is a string
        tokens = word_tokenize(text.lower())
        tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
        return ' '.join(tokens)
    else:
        return ''

data['chat1_processed'] = data['chat1'].apply(preprocess_text_lemmatize)
data['chat2_processed'] = data['chat2'].apply(preprocess_text_lemmatize)


# Using a more complex model like XGBoost
from xgboost import XGBClassifier

# Extract features from the preprocessed text data
X = vectorizer.fit_transform(data['chat1_processed'] + ' ' + data['chat2_processed'])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, data['label'], test_size=0.2, random_state=42)

# Train a XGBClassifier model on the training set
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

# Make predictions on the test set
y_pred_xgb = xgb.predict(X_test)

# Evaluate the performance of the model
print('Accuracy:', accuracy_score(y_test, y_pred_xgb))
print('Precision:', precision_score(y_test, y_pred_xgb))
print('Recall:', recall_score(y_test, y_pred_xgb))
print('F1 score:', f1_score(y_test, y_pred_xgb))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Accuracy: 0.7583333333333333
Precision: 0.7704918032786885
Recall: 0.7580645161290323
F1 score: 0.7642276422764228


In [19]:
data = pd.read_csv('/content/drive/MyDrive/NLPFinalProject/Model Implementation/imbalancedtask1.csv')

from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

def preprocess_text_lemmatize(text):
    if isinstance(text, str):  # Check if text is a string
        tokens = word_tokenize(text.lower())
        tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
        return ' '.join(tokens)
    else:
        return ''

data['chat1_processed'] = data['chat1'].apply(preprocess_text_lemmatize)
data['chat2_processed'] = data['chat2'].apply(preprocess_text_lemmatize)

# Using a more complex model like XGBoost
from xgboost import XGBClassifier

# Extract features from the preprocessed text data
X = vectorizer.fit_transform(data['chat1_processed'] + ' ' + data['chat2_processed'])

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [20]:
# Using SMOTE for oversampling
from imblearn.over_sampling import SMOTE

# Perform oversampling before splitting into training and test sets
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, data['label'])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# Use GridSearchCV for hyperparameter tuning
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.5,],
    'colsample_bytree': [0.5],
    'n_estimators' : [100],
    'objective': ['binary:logistic']
}

# Initialize the classifier
xgb = XGBClassifier()

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=3, scoring='f1', verbose=2, n_jobs=-1)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best parameters: ", best_params)

# Train a XGBClassifier model on the training set with the best parameters
xgb_best = XGBClassifier(**best_params)
xgb_best.fit(X_train, y_train)

# Make predictions on the test set
y_pred_xgb = xgb_best.predict(X_test)

# Evaluate the performance of the model
print('Accuracy:', accuracy_score(y_test, y_pred_xgb))
print('Precision:', precision_score(y_test, y_pred_xgb))
print('Recall:', recall_score(y_test, y_pred_xgb))
print('F1 score:', f1_score(y_test, y_pred_xgb))


Fitting 3 folds for each of 12 candidates, totalling 36 fits
Best parameters:  {'colsample_bytree': 0.5, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 100, 'objective': 'binary:logistic', 'subsample': 0.5}
Accuracy: 0.8924242424242425
Precision: 0.922360248447205
Recall: 0.8658892128279884
F1 score: 0.8932330827067669


In [None]:
print("\nClassification Report:\n", classification_report(y_test, y_pred_xgb))


Classification Report:
               precision    recall  f1-score   support

       False       0.86      0.92      0.89       317
        True       0.92      0.87      0.89       343

    accuracy                           0.89       660
   macro avg       0.89      0.89      0.89       660
weighted avg       0.89      0.89      0.89       660

