# Stochastic Gradient Descent


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.model_selection import cross_val_score,StratifiedKFold
from sklearn.preprocessing import StandardScaler

## Importing Raw, Cleaned, and Cleaned & Deduplicated Datasets

In [None]:
from google.colab import drive

# Mount Google Drive so the project CSVs are reachable from the Colab filesystem.
drive.mount('/content/drive')

# All three dataset files sit in the same project folder on Drive.
dataset_dir = "/content/drive/My Drive/IT1244_Team1_Project/Model & Dataset"
data_path_raw = dataset_dir + "/dataset.csv"
data_path_nondeduplicated = dataset_dir + "/ml_cleaned.csv"
data_path_deduplicated = dataset_dir + "/ml_cleaned_deduplicated.csv"


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# NOTE(review): `columns` is not referenced by any other cell visible in this notebook.
columns = ['label', 'tweet']

# Read the cleaned CSV and its deduplicated variant, in that order.
df_nondeduplicated, df_deduplicated = (
    pd.read_csv(csv_path)
    for csv_path in (data_path_nondeduplicated, data_path_deduplicated)
)

# Preview the first rows of the cleaned (non-deduplicated) frame.
df_nondeduplicated.head()

Unnamed: 0,label,tweet,clean_tweet,lemmatized_tweet,tweet_length
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",awww that bummer shoulda david carr third,awww bummer shoulda david carr third,6
1,0,is upset that he can't update his Facebook by ...,upset that cannot update facebook texting migh...,upset update facebook texting might result sch...,10
2,0,@Kenichan I dived many times for the ball. Man...,dived many times ball managed save rest bounds,dived many time ball managed save rest bound,8
3,0,my whole body feels itchy and like its on fire,whole body feels itchy like fire,whole body feel itchy like fire,6
4,0,"@nationwideclass no, it's not behaving at all....",behaving here because cannot over there,behaving,1


##Feature Extraction

###Perform TF-IDF Vectorization

In [None]:
# TF-IDF settings shared by both corpora.
tfidf_params = dict(max_df=0.90, min_df=2, max_features=5000, stop_words='english')

# One vectorizer per corpus: calling fit_transform twice on a single instance
# replaces the first vocabulary/IDF weights, leaving nothing that can
# reproduce the columns of `tfidf_nondeduplicated` later on.
tfidf_vectorizer_nondeduplicated = TfidfVectorizer(**tfidf_params)
tfidf_vectorizer_deduplicated = TfidfVectorizer(**tfidf_params)

# NOTE(review): both vectorizers are fit on the complete frames before the
# train/test split, so test tweets contribute to the vocabulary and IDF
# weights. Consider fitting on the training rows only.
tfidf_nondeduplicated = tfidf_vectorizer_nondeduplicated.fit_transform(df_nondeduplicated['lemmatized_tweet'])
tfidf_deduplicated = tfidf_vectorizer_deduplicated.fit_transform(df_deduplicated['lemmatized_tweet'])

# Backward-compatible name: as before, `tfidf_vectorizer` ends up being the
# vectorizer fitted on the deduplicated corpus.
tfidf_vectorizer = tfidf_vectorizer_deduplicated

Split each TF-IDF matrix into stratified train and test sets (80% / 20%).

In [None]:
# Same split settings for both datasets: 20% held out, fixed seed.
split_options = dict(test_size=0.2, random_state=42)

# Cleaned dataset, stratified on the label column.
labels_nondeduplicated = df_nondeduplicated['label']
(
    X_train_nondeduplicated,
    X_test_nondeduplicated,
    y_train_nondeduplicated,
    y_test_nondeduplicated,
) = train_test_split(
    tfidf_nondeduplicated,
    labels_nondeduplicated,
    stratify=labels_nondeduplicated,
    **split_options,
)

# Cleaned and deduplicated dataset, stratified on the label column.
labels_deduplicated = df_deduplicated['label']
(
    X_train_deduplicated,
    X_test_deduplicated,
    y_train_deduplicated,
    y_test_deduplicated,
) = train_test_split(
    tfidf_deduplicated,
    labels_deduplicated,
    stratify=labels_deduplicated,
    **split_options,
)

##Feature scaling

In [None]:
# Scale features to unit variance. `with_mean=False` skips centering, which
# StandardScaler requires for sparse input such as TF-IDF matrices.
#
# Each dataset gets its own scaler so the statistics learned on one training
# split are not discarded when the other dataset is fitted; the fitted scaler
# is needed to transform new data the same way at prediction time.
#
# NOTE: this cell overwrites its own inputs. Re-run the train/test split cell
# before re-running this one, otherwise already-scaled matrices are rescaled.

# Cleaned dataset: fit on the training split only, then apply to the test split.
scaler_nondeduplicated = StandardScaler(with_mean=False)
X_train_nondeduplicated = scaler_nondeduplicated.fit_transform(X_train_nondeduplicated)
X_test_nondeduplicated = scaler_nondeduplicated.transform(X_test_nondeduplicated)

# Cleaned and deduplicated dataset: same procedure with an independent scaler.
scaler_deduplicated = StandardScaler(with_mean=False)
X_train_deduplicated = scaler_deduplicated.fit_transform(X_train_deduplicated)
X_test_deduplicated = scaler_deduplicated.transform(X_test_deduplicated)

# Backward-compatible name: as before, `scaler` ends up fitted on the
# deduplicated training split.
scaler = scaler_deduplicated

##Model Initialization

In [None]:
# Baseline classifier: hinge loss with every other hyperparameter left at its
# library default; the seed is fixed so repeated fits are reproducible.
sgd_model = SGDClassifier(
    random_state=42,
    loss='hinge',
)

##5-Fold Cross-Validation of Initial Model

###Cleaned Dataset

In [None]:
# Shared 5-fold stratified splitter (shuffled, fixed seed); later cells reuse `cv`.
cv = StratifiedKFold(random_state=42, shuffle=True, n_splits=5)

# Accuracy of the baseline model on each fold of the cleaned training split.
cv_scores = cross_val_score(
    sgd_model,
    X_train_nondeduplicated,
    y_train_nondeduplicated,
    scoring='accuracy',
    cv=cv,
)
cross_val_accuracy = cv_scores.mean()

# Report per-fold scores plus their mean and spread.
print(f"Cross-validation scores for each fold: {cv_scores}")
print(f"Mean cross-validation accuracy: {cross_val_accuracy}")
print(f"Standard deviation of cross-validation accuracy: {cv_scores.std()}")

Cross-validation scores for each fold: [0.6811071  0.68896067 0.68908734 0.68427386 0.68938434]
Mean cross-validation accuracy: 0.686562662414878
Standard deviation of cross-validation accuracy: 0.003319279401509934


###Cleaned and Deduplicated Dataset

In [None]:
# Accuracy of the baseline model on each fold of the deduplicated training
# split, using the `cv` splitter defined in the previous cross-validation cell.
cv_scores = cross_val_score(
    sgd_model,
    X_train_deduplicated,
    y_train_deduplicated,
    scoring='accuracy',
    cv=cv,
)
cross_val_accuracy = cv_scores.mean()

# Report per-fold scores plus their mean and spread.
print(f"Cross-validation scores for each fold: {cv_scores}")
print(f"Mean cross-validation accuracy: {cross_val_accuracy}")
print(f"Standard deviation of cross-validation accuracy: {cv_scores.std()}")

Cross-validation scores for each fold: [0.69592216 0.69515396 0.70027527 0.69144101 0.70264388]
Mean cross-validation accuracy: 0.6970872543371103
Standard deviation of cross-validation accuracy: 0.0039500549666779


##Evaluation and Training of Initial Model

###Cleaned Dataset

In [None]:
# Fit the baseline classifier on the complete cleaned training split.
sgd_model.fit(X_train_nondeduplicated, y_train_nondeduplicated)

# Predict the held-out cleaned test split and score it.
y_pred_nondeduplicated = sgd_model.predict(X_test_nondeduplicated)
accuracy = accuracy_score(y_test_nondeduplicated, y_pred_nondeduplicated)
print(f"Test Set Accuracy: {accuracy}")

# Per-class precision / recall / F1.
report_text = classification_report(y_test_nondeduplicated, y_pred_nondeduplicated)
print("\nClassification Report:", report_text, sep="\n")

# Confusion matrix (scikit-learn convention: rows = true labels, columns = predictions).
confusion_counts = confusion_matrix(y_test_nondeduplicated, y_pred_nondeduplicated)
print("\nConfusion Matrix:", confusion_counts, sep="\n")

Test Set Accuracy: 0.6892131529614429

Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.67      0.68      9880
           1       0.68      0.70      0.69      9857

    accuracy                           0.69     19737
   macro avg       0.69      0.69      0.69     19737
weighted avg       0.69      0.69      0.69     19737


Confusion Matrix:
[[6659 3221]
 [2913 6944]]


###Cleaned and Deduplicated Dataset

In [None]:
# Refit the same baseline classifier on the complete deduplicated training split.
sgd_model.fit(X_train_deduplicated, y_train_deduplicated)

# Predict the held-out deduplicated test split and score it.
y_pred_deduplicated = sgd_model.predict(X_test_deduplicated)
accuracy = accuracy_score(y_test_deduplicated, y_pred_deduplicated)
print(f"Test Set Accuracy: {accuracy}")

# Per-class precision / recall / F1.
report_text = classification_report(y_test_deduplicated, y_pred_deduplicated)
print("\nClassification Report:", report_text, sep="\n")

# Confusion matrix (scikit-learn convention: rows = true labels, columns = predictions).
confusion_counts = confusion_matrix(y_test_deduplicated, y_pred_deduplicated)
print("\nConfusion Matrix:", confusion_counts, sep="\n")

Test Set Accuracy: 0.7043580683156655

Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.69      0.70      9764
           1       0.70      0.72      0.71      9763

    accuracy                           0.70     19527
   macro avg       0.70      0.70      0.70     19527
weighted avg       0.70      0.70      0.70     19527


Confusion Matrix:
[[6695 3069]
 [2704 7059]]


##Hyperparameter Tuning with 5-Fold Cross-Validation

###Cleaned Dataset

In [None]:
# Hyperparameters searched under every learning-rate schedule.
shared_grid = {
    'loss': ['hinge'],
    'penalty': ['l2'],
    'alpha': [0.001, 0.005, 0.01],
    'max_iter': [1000, 1500],
    'tol': [1e-4, 1e-3],
}

# `eta0` is only read by the 'constant' schedule here; the 'optimal' schedule
# ignores it. Crossing eta0 with 'optimal' therefore fits the same candidate
# three times, so the grid is split per schedule (48 candidates instead of 72,
# same set of distinct models).
param_grid = [
    {**shared_grid, 'learning_rate': ['constant'], 'eta0': [0.001, 0.005, 0.01]},
    {**shared_grid, 'learning_rate': ['optimal']},
]

# Exhaustive 5-fold search on the cleaned training split, using all CPU cores.
grid_search = GridSearchCV(sgd_model, param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train_nondeduplicated, y_train_nondeduplicated)

best_sgd_model = grid_search.best_estimator_
# `best_sgd_model` is rebound by the deduplicated search cell, so the winner
# for the cleaned dataset is also kept under a dataset-specific name.
best_sgd_model_nondeduplicated = best_sgd_model
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best Hyperparameters: {'alpha': 0.005, 'eta0': 0.001, 'learning_rate': 'optimal', 'loss': 'hinge', 'max_iter': 1000, 'penalty': 'l2', 'tol': 0.0001}
Best Cross-Validation Score: 0.7231835043472891


###Cleaned and Deduplicated Dataset

In [None]:
# Hyperparameters searched under every learning-rate schedule.
shared_grid = {
    'loss': ['hinge'],
    'penalty': ['l2'],
    'alpha': [0.001, 0.005, 0.01],
    'max_iter': [1000, 1500],
    'tol': [1e-4, 1e-3],
}

# `eta0` is only read by the 'constant' schedule here; the 'optimal' schedule
# ignores it. Crossing eta0 with 'optimal' therefore fits the same candidate
# three times, so the grid is split per schedule (48 candidates instead of 72,
# same set of distinct models).
param_grid = [
    {**shared_grid, 'learning_rate': ['constant'], 'eta0': [0.001, 0.005, 0.01]},
    {**shared_grid, 'learning_rate': ['optimal']},
]

# Exhaustive 5-fold search on the deduplicated training split, using all CPU cores.
grid_search = GridSearchCV(sgd_model, param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train_deduplicated, y_train_deduplicated)

best_sgd_model = grid_search.best_estimator_
# Also keep the winner under a dataset-specific name so it stays identifiable
# if `best_sgd_model` is reassigned by another search.
best_sgd_model_deduplicated = best_sgd_model
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best Hyperparameters: {'alpha': 0.001, 'eta0': 0.001, 'learning_rate': 'optimal', 'loss': 'hinge', 'max_iter': 1000, 'penalty': 'l2', 'tol': 0.0001}
Best Cross-Validation Score: 0.7301837270341206


##Evaluation and Training of Tuned Model


In [None]:
# Refit the tuned estimator on the complete cleaned training split.
# NOTE(review): `best_sgd_model` is whatever the most recently executed
# grid-search cell assigned. When the notebook runs top to bottom that is the
# deduplicated search, so these are not the hyperparameters selected for the
# cleaned dataset. Confirm whether that is intended.
SGD_model = best_sgd_model.fit(X_train_nondeduplicated, y_train_nondeduplicated)

# Predict the held-out cleaned test split and score it.
y_pred_nondeduplicated = SGD_model.predict(X_test_nondeduplicated)
accuracy = accuracy_score(y_test_nondeduplicated, y_pred_nondeduplicated)
print(f"Test Set Accuracy: {accuracy}")

# Per-class precision / recall / F1.
report_text = classification_report(y_test_nondeduplicated, y_pred_nondeduplicated)
print("\nClassification Report:", report_text, sep="\n")

# Confusion matrix (scikit-learn convention: rows = true labels, columns = predictions).
confusion_counts = confusion_matrix(y_test_nondeduplicated, y_pred_nondeduplicated)
print("\nConfusion Matrix:", confusion_counts, sep="\n")

Test Set Accuracy: 0.7191062471500228

Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.69      0.71      9880
           1       0.71      0.75      0.73      9857

    accuracy                           0.72     19737
   macro avg       0.72      0.72      0.72     19737
weighted avg       0.72      0.72      0.72     19737


Confusion Matrix:
[[6813 3067]
 [2477 7380]]


In [None]:
# Refit the tuned estimator on the complete deduplicated training split.
# `fit` returns the estimator itself, so `SGD_model` and `best_sgd_model`
# refer to the same object afterwards.
SGD_model = best_sgd_model.fit(X_train_deduplicated, y_train_deduplicated)

# Predict the held-out deduplicated test split and score it.
y_pred_deduplicated = SGD_model.predict(X_test_deduplicated)
accuracy = accuracy_score(y_test_deduplicated, y_pred_deduplicated)
print(f"Test Set Accuracy: {accuracy}")

# Per-class precision / recall / F1.
report_text = classification_report(y_test_deduplicated, y_pred_deduplicated)
print("\nClassification Report:", report_text, sep="\n")

# Confusion matrix (scikit-learn convention: rows = true labels, columns = predictions).
confusion_counts = confusion_matrix(y_test_deduplicated, y_pred_deduplicated)
print("\nConfusion Matrix:", confusion_counts, sep="\n")

Test Set Accuracy: 0.7391816459261535

Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.69      0.73      9764
           1       0.72      0.79      0.75      9763

    accuracy                           0.74     19527
   macro avg       0.74      0.74      0.74     19527
weighted avg       0.74      0.74      0.74     19527


Confusion Matrix:
[[6762 3002]
 [2091 7672]]


##Exporting Model

###Cleaned Dataset

In [None]:
import os
import json
import shutil
import pickle

# Save Model to Drive
def save_model_to_drive(model, vectorizer, model_name, metrics=None,
                        artifact_prefix="cleaned", model_filename="model.pkl"):
    """Pickle a model and its vectorizer locally, then mirror the folder to Google Drive.

    Args:
        model: Fitted estimator to pickle.
        vectorizer: Fitted vectorizer to pickle next to the model.
        model_name: Folder name used both locally (``./<model_name>``) and
            under the project folder on Drive.
        metrics: Optional JSON-serialisable dict; written only when truthy.
        artifact_prefix: Prefix of the vectorizer/metrics file names
            (``<prefix>_vectorizer.pkl`` and ``<prefix>_vectorizer.json``).
            The default reproduces the original file names.
        model_filename: File name of the pickled model inside the folder.
            Exports that share ``model_name`` must use distinct names here,
            otherwise the later export overwrites the earlier model.

    Side effects:
        Force-remounts Drive, writes into ``./<model_name>``, deletes any
        existing ``<model_name>`` folder on Drive and replaces it with a copy
        of the local folder.
    """
    # Mount Google Drive
    drive.mount("/content/drive", force_remount=True)

    # Paths
    local_path = f"./{model_name}"
    drive_base_path = "/content/drive/My Drive/IT1244_Team1_Project/Model & Dataset"
    drive_path = os.path.join(drive_base_path, model_name)

    # Create the local directory; a no-op when it already exists
    os.makedirs(local_path, exist_ok=True)

    # Save model and vectorizer locally
    with open(os.path.join(local_path, model_filename), "wb") as model_file:
        pickle.dump(model, model_file)
    with open(os.path.join(local_path, f"{artifact_prefix}_vectorizer.pkl"), "wb") as vectorizer_file:
        pickle.dump(vectorizer, vectorizer_file)

    # Save metrics to local folder (file name kept as "<prefix>_vectorizer.json"
    # for compatibility with existing artifacts, although it holds metrics)
    if metrics:
        with open(os.path.join(local_path, f"{artifact_prefix}_vectorizer.json"), "w") as f:
            json.dump(metrics, f, indent=4)

    # Remove old copy in Google Drive (if exists)
    if os.path.exists(drive_path):
        shutil.rmtree(drive_path)

    # Copy the whole local folder to Google Drive
    shutil.copytree(local_path, drive_path)
    print(f"Model and metrics saved to Google Drive at: {drive_path}")

# Export for the cleaned (non-deduplicated) dataset.
#
# When the notebook runs top to bottom, `SGD_model`, `accuracy`,
# `cross_val_accuracy` and `tfidf_vectorizer` were all last assigned by
# deduplicated cells. Everything exported here is therefore rebuilt from the
# cleaned data, under new names so the deduplicated export cell still sees
# the globals it expects.

# Same hyperparameters as the tuned model evaluated above, refit on the cleaned
# training split (deterministic because the estimator carries a fixed seed).
sgd_model_nondeduplicated = clone(SGD_model).fit(X_train_nondeduplicated, y_train_nondeduplicated)
y_pred_nondeduplicated = sgd_model_nondeduplicated.predict(X_test_nondeduplicated)

# Vectorizer with the same settings, fitted on the cleaned corpus so its
# vocabulary matches the features this model was trained on.
export_vectorizer_nondeduplicated = clone(tfidf_vectorizer).fit(df_nondeduplicated['lemmatized_tweet'])

# Mean 5-fold CV accuracy of the baseline `sgd_model` on the cleaned training
# split: the same quantity `cross_val_accuracy` is assigned in the
# cross-validation cells, recomputed because that global has been overwritten.
cv_scores_nondeduplicated = cross_val_score(
    sgd_model, X_train_nondeduplicated, y_train_nondeduplicated, cv=cv, scoring='accuracy'
)

# Metrics for the SGD classifier on the cleaned dataset
metrics_sgd = {
    "accuracy": float(accuracy_score(y_test_nondeduplicated, y_pred_nondeduplicated)),
    "cross_val_accuracy": float(np.mean(cv_scores_nondeduplicated)),
    "classification_report": classification_report(y_test_nondeduplicated, y_pred_nondeduplicated, output_dict=True),
    "confusion_matrix": confusion_matrix(y_test_nondeduplicated, y_pred_nondeduplicated).tolist()
}

# NOTE(review): the model was trained on StandardScaler-scaled features, but
# only the model and vectorizer are pickled; the fitted scaler is not exported.
# The model goes to its own file so the deduplicated export, which writes
# "model.pkl" into the same folder, does not overwrite it.
save_model_to_drive(
    sgd_model_nondeduplicated,
    export_vectorizer_nondeduplicated,
    "Stochastic Gradient Descent",
    metrics=metrics_sgd,
    artifact_prefix="cleaned",
    model_filename="cleaned_model.pkl",
)

Mounted at /content/drive
Model and metrics saved to Google Drive at: /content/drive/My Drive/it1244 girl boss/IT1244_Team1_Project/Model & Dataset/Stochastic Gradient Descent


### Cleaned and Deduplicated Dataset

In [None]:
import os
import json
import shutil
import pickle

# Save Model to Drive
def save_model_to_drive(model, vectorizer, model_name, metrics=None,
                        artifact_prefix="cleaned_deduplicated", model_filename="model.pkl"):
    """Pickle a model and its vectorizer locally, then mirror the folder to Google Drive.

    Args:
        model: Fitted estimator to pickle.
        vectorizer: Fitted vectorizer to pickle next to the model.
        model_name: Folder name used both locally (``./<model_name>``) and
            under the project folder on Drive.
        metrics: Optional JSON-serialisable dict; written only when truthy.
        artifact_prefix: Prefix of the vectorizer/metrics file names
            (``<prefix>_vectorizer.pkl`` and ``<prefix>_vectorizer.json``).
            The default reproduces the original file names.
        model_filename: File name of the pickled model inside the folder.
            Exports that share ``model_name`` must use distinct names here,
            otherwise the later export overwrites the earlier model.

    Side effects:
        Force-remounts Drive, writes into ``./<model_name>``, deletes any
        existing ``<model_name>`` folder on Drive and replaces it with a copy
        of the local folder.
    """
    # Mount Google Drive
    drive.mount("/content/drive", force_remount=True)

    # Paths
    local_path = f"./{model_name}"
    drive_base_path = "/content/drive/My Drive/IT1244_Team1_Project/Model & Dataset"
    drive_path = os.path.join(drive_base_path, model_name)

    # Create the local directory; a no-op when it already exists
    os.makedirs(local_path, exist_ok=True)

    # Save model and vectorizer locally
    with open(os.path.join(local_path, model_filename), "wb") as model_file:
        pickle.dump(model, model_file)
    with open(os.path.join(local_path, f"{artifact_prefix}_vectorizer.pkl"), "wb") as vectorizer_file:
        pickle.dump(vectorizer, vectorizer_file)

    # Save metrics to local folder (file name kept as "<prefix>_vectorizer.json"
    # for compatibility with existing artifacts, although it holds metrics)
    if metrics:
        with open(os.path.join(local_path, f"{artifact_prefix}_vectorizer.json"), "w") as f:
            json.dump(metrics, f, indent=4)

    # Remove old copy in Google Drive (if exists)
    if os.path.exists(drive_path):
        shutil.rmtree(drive_path)

    # Copy the whole local folder to Google Drive
    shutil.copytree(local_path, drive_path)
    print(f"Model and metrics saved to Google Drive at: {drive_path}")

# Metrics for the SGD classifier on the cleaned and deduplicated dataset.
# Accuracy is computed directly from this dataset's predictions instead of
# reading the `accuracy` global, which holds whichever evaluation cell ran last.
# NOTE(review): `cross_val_accuracy` is still read from the global; it holds the
# baseline model's deduplicated CV mean only if that cross-validation cell was
# the last one to assign it.
metrics_sgd = {
    "accuracy": float(accuracy_score(y_test_deduplicated, y_pred_deduplicated)),
    "cross_val_accuracy": cross_val_accuracy,
    "classification_report": classification_report(y_test_deduplicated, y_pred_deduplicated, output_dict=True),
    "confusion_matrix": confusion_matrix(y_test_deduplicated, y_pred_deduplicated).tolist()
}
# NOTE(review): the model was trained on StandardScaler-scaled features, but
# only the model and vectorizer are pickled; the fitted scaler is not exported.
save_model_to_drive(SGD_model, tfidf_vectorizer, "Stochastic Gradient Descent", metrics=metrics_sgd)

Mounted at /content/drive
Model and metrics saved to Google Drive at: /content/drive/My Drive/it1244 girl boss/IT1244_Team1_Project/Model & Dataset/Stochastic Gradient Descent
