# **Logistic Regression Model**

In [52]:
import pandas as pd
import numpy as np
import warnings
# NOTE(review): 'ignore' silences every warning category for the rest of the
# session, so any warnings raised by the libraries used below will not be shown.
warnings.filterwarnings('ignore')

## For cleaned dataset

### Importing Dataset

In [53]:
from google.colab import drive
# Mount Google Drive at /content/drive so the project folder can be read.
drive.mount('/content/drive')
# Path of the cleaned dataset CSV inside the project folder on Drive.
data_path='/content/drive/My Drive/IT1244_Team1_Project/Model & Dataset/ml_cleaned.csv'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [54]:
# Load the cleaned dataset from data_path and preview its first rows.
df=pd.read_csv(data_path)
df.head()

Unnamed: 0,label,tweet,clean_tweet,lemmatized_tweet,tweet_length
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",awww that bummer shoulda david carr third,awww bummer shoulda david carr third,6
1,0,is upset that he can't update his Facebook by ...,upset that cannot update facebook texting migh...,upset update facebook texting might result sch...,10
2,0,@Kenichan I dived many times for the ball. Man...,dived many times ball managed save rest bounds,dived many time ball managed save rest bound,8
3,0,my whole body feels itchy and like its on fire,whole body feels itchy like fire,whole body feel itchy like fire,6
4,0,"@nationwideclass no, it's not behaving at all....",behaving here because cannot over there,behaving,1


### Feature Extraction

In [55]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

#### Perform TF-IDF Vectorization

In [56]:
def tfidf_vetorization(data,ngram_range=(1,2),max_features=10000,min_df=3,max_df=0.7):
    """Fit a TF-IDF vectorizer on ``data`` and return the transformed matrix.

    The vectorizer learns its vocabulary and IDF weights from exactly the
    documents passed in, so pass only training text if the test text must not
    influence the features.

    Args:
        data: Iterable of text documents to fit on and transform.
        ngram_range: ``(min_n, max_n)`` n-gram sizes to extract; the default
            ``(1, 2)`` yields unigrams and bigrams.
        max_features: Upper bound on vocabulary size (default 10000).
        min_df: Minimum document frequency for a term to be kept (default 3).
        max_df: Maximum document frequency for a term to be kept (default 0.7).

    Returns:
        A ``(tfidf_matrix, vectorizer)`` tuple: the matrix produced by
        ``fit_transform`` and the fitted ``TfidfVectorizer``.
    """
    vectorizer = TfidfVectorizer(
        max_features=max_features,
        ngram_range=ngram_range,
        min_df=min_df,
        max_df=max_df,
    )
    tfidf_matrix = vectorizer.fit_transform(data)
    return tfidf_matrix, vectorizer

#### Split into train and test dataset

In [57]:
def train_test_splitting(X,y,test_size=0.2,random_state=42):
    """Split features and labels into train and test partitions.

    Args:
        X: Feature matrix to split.
        y: Labels aligned row-for-row with ``X``.
        test_size: Forwarded to ``train_test_split`` (default 0.2).
        random_state: Seed forwarded to ``train_test_split``; the default 42
            is the value that was previously hard-coded here.

    Returns:
        A ``(X_train, X_test, y_train, y_test)`` tuple.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

In [58]:
# Replace missing lemmatized tweets with empty strings before vectorizing
# (presumably so the vectorizer only ever receives str values).
df['lemmatized_tweet'] = df['lemmatized_tweet'].fillna("")
# NOTE(review): the vectorizer is fitted on the whole column before the split
# below, so vocabulary and IDF weights are learned from rows that later land in
# the test set — confirm this is acceptable for the reported test metrics.
tfidf_matrix, vectorizer = tfidf_vetorization(df['lemmatized_tweet'])
# Split with the helper's defaults (test_size=0.2, random_state=42).
X_train,X_test,y_train,y_test=train_test_splitting(tfidf_matrix,df['label'])
print(f"Train set shape:{X_train.shape}, Test set shape: {X_test.shape}")

Train set shape:(78944, 10000), Test set shape: (19737, 10000)


In [59]:
# Show the first 20 entries of the fitted vocabulary as a sanity check.
print(vectorizer.get_feature_names_out()[:20])

['aaaah' 'aaah' 'aaaw' 'aargh' 'aaron' 'abandoned' 'abby' 'aberdeen'
 'ability' 'abit' 'able' 'able make' 'able sleep' 'able take' 'abroad'
 'absent' 'absolute' 'absolutely' 'absolutely love' 'absolutely nothing']


### Logistic Regression

In [60]:
#scikit learn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

#### Model Initialisation

In [61]:
# Baseline logistic regression for the cleaned dataset: fit on the training
# split, then predict labels for the held-out test split.
log_reg_model = LogisticRegression(max_iter=1000)
log_reg_model.fit(X_train, y_train)
y_pred = log_reg_model.predict(X_test)

#### Evaluation of Initial Model

In [62]:
# Baseline evaluation on the test split of the cleaned dataset: overall
# accuracy, per-class report and confusion matrix, printed in that order.
accuracy_log_reg = accuracy_score(y_test, y_pred)
print(
    f"Logistic Regression Accuracy: {accuracy_log_reg:.4f}",
    "\nLogistic Regression Classification Report:",
    classification_report(y_test, y_pred),
    "\nLogistic Regression Confusion Matrix:",
    confusion_matrix(y_test, y_pred),
    sep="\n",
)

Logistic Regression Accuracy: 0.7503

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.73      0.75      9900
           1       0.74      0.77      0.75      9837

    accuracy                           0.75     19737
   macro avg       0.75      0.75      0.75     19737
weighted avg       0.75      0.75      0.75     19737


Logistic Regression Confusion Matrix:
[[7240 2660]
 [2268 7569]]


### Perform 5-Fold Cross-Validation of Initialised Model

In [63]:
from sklearn.model_selection import cross_val_score, cross_val_predict

In [64]:
# 5-fold cross-validated predictions over the full TF-IDF matrix of the cleaned
# dataset (one out-of-fold prediction per row).
# NOTE(review): tfidf_matrix was produced by a vectorizer fitted on all rows, so
# the folds share vocabulary/IDF statistics — confirm this is intended.
y_pred_cv_log_reg=cross_val_predict(log_reg_model,tfidf_matrix,df['label'],cv=5)

#### Evaluation of Initialised Model with 5-Fold Cross-Validation

In [65]:
# Imported here so this cell is self-contained; cross_validate scores several
# metrics from a single set of cross-validation fits.
from sklearn.model_selection import cross_validate

# Classification report on the pooled out-of-fold predictions
print("\nClassification Report:")
print(classification_report(df['label'],y_pred_cv_log_reg))

# Confusion matrix on the pooled out-of-fold predictions
print("\nConfusion Matrix:")
print(confusion_matrix(df['label'],y_pred_cv_log_reg))

# Accuracy over all out-of-fold predictions
accuracy_cv_log_reg=accuracy_score(df['label'],y_pred_cv_log_reg)
print(f"Accuracy: {accuracy_cv_log_reg:.4f}")

# Weighted precision / recall / F1 averaged across the 5 folds. One
# cross_validate call replaces three separate cross_val_score runs, so the
# model is fitted 5 times here instead of 15; the per-fold scores are the same.
cv_scores_log_reg = cross_validate(
    log_reg_model,
    tfidf_matrix,
    df['label'],
    cv=5,
    scoring=['precision_weighted', 'recall_weighted', 'f1_weighted'],
)

precision_cv = cv_scores_log_reg['test_precision_weighted'].mean()
print(f"Precision: {precision_cv:.4f}")

recall_cv = cv_scores_log_reg['test_recall_weighted'].mean()
print(f"Recall: {recall_cv:.4f}")

f1_cv = cv_scores_log_reg['test_f1_weighted'].mean()
print(f"f1-score: {f1_cv:.4f}")


Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.72      0.73     49400
           1       0.73      0.76      0.74     49281

    accuracy                           0.74     98681
   macro avg       0.74      0.74      0.74     98681
weighted avg       0.74      0.74      0.74     98681


Confusion Matrix:
[[35363 14037]
 [11736 37545]]
Accuracy: 0.7388
Precision: 0.7395
Recall: 0.7388
f1-score: 0.7387


### Hyperparameter Tuning with 5-Fold Cross-Validation

In [66]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [67]:
# Hyperparameter tuning for logistic regression on the cleaned dataset
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # Inverse regularization strength (smaller = stronger penalty)
    'solver': ['lbfgs', 'liblinear'],  # Different solvers for comparison
    'max_iter': [500, 1000, 1500]
}

# Exhaustive search over the grid with 5-fold CV on the training split only,
# selecting by accuracy and using all available cores.
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# GridSearchCV is used with its default refit=True, so best_estimator_ has
# already been retrained on all of X_train with the winning parameters; a
# second explicit fit on the same data would only repeat that work.
best_log_reg_model = grid_search.best_estimator_
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

# Predict using the tuned model
y_pred_tuned = best_log_reg_model.predict(X_test)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best Hyperparameters: {'C': 1, 'max_iter': 500, 'solver': 'lbfgs'}
Best Cross-Validation Score: 0.7446923782681811


#### Evaluation of Tuned Model

In [68]:
# Tuned model on the test split of the cleaned dataset: report and confusion matrix.
print("\nTuned Model Classification Report:", classification_report(y_test, y_pred_tuned), sep="\n")
print("\nTuned Model Confusion Matrix:", confusion_matrix(y_test, y_pred_tuned), sep="\n")

# Scalar metrics for the tuned model; precision/recall/F1 are support-weighted
# averages over the classes.
accuracy_tuned = accuracy_score(y_test, y_pred_tuned)
precision_tuned, recall_tuned, f1_tuned = (
    scorer(y_test, y_pred_tuned, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

# Report the four scores to four decimal places.
print(f"Tuned Model Accuracy: {accuracy_tuned:.4f}")
print(f"Tuned Model Precision: {precision_tuned:.4f}")
print(f"Tuned Model Recall: {recall_tuned:.4f}")
print(f"Tuned Model F1-Score: {f1_tuned:.4f}")


Tuned Model Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.73      0.75      9900
           1       0.74      0.77      0.75      9837

    accuracy                           0.75     19737
   macro avg       0.75      0.75      0.75     19737
weighted avg       0.75      0.75      0.75     19737


Tuned Model Confusion Matrix:
[[7240 2660]
 [2268 7569]]
Tuned Model Accuracy: 0.7503
Tuned Model Precision: 0.7507
Tuned Model Recall: 0.7503
Tuned Model F1-Score: 0.7502


### Exporting Model

In [69]:
import os
import json
import shutil
import pickle

# Save Model to Drive
def save_model_to_drive(model, vectorizer, model_name, metrics=None, artifact_prefix="cleaned"):
    """Pickle a model and its vectorizer locally, then copy the folder to Drive.

    Files are written to ``./<model_name>`` and that folder then replaces
    ``<project folder>/<model_name>`` on Google Drive.

    The model is written as ``model.pkl`` (the existing file name) and also as
    ``<artifact_prefix>_model.pkl``. ``model.pkl`` does not depend on the
    prefix, so a later export into the same ``model_name`` folder overwrites
    it; the prefixed copy keeps each model stored next to the vectorizer it was
    trained with.

    Args:
        model: Fitted estimator to pickle.
        vectorizer: Fitted vectorizer to pickle.
        model_name: Folder name used both locally and under the Drive project
            folder.
        metrics: Optional JSON-serialisable mapping; written only when truthy.
        artifact_prefix: Prefix for the vectorizer, metrics and extra model
            file names. The default "cleaned" reproduces the previous file
            names ``cleaned_vectorizer.pkl`` and ``cleaned_metrics.json``.
    """
    # Re-mount so the Drive path is available even if the mount has gone stale.
    drive.mount("/content/drive", force_remount=True)

    # Paths
    local_path = f"./{model_name}"
    drive_base_path = "/content/drive/My Drive/IT1244_Team1_Project/Model & Dataset"
    drive_path = os.path.join(drive_base_path, model_name)

    # Create the local directory; a no-op when it already exists.
    os.makedirs(local_path, exist_ok=True)

    # Save the model under the legacy name and under a prefixed name.
    model_filename = os.path.join(local_path, "model.pkl")
    prefixed_model_filename = os.path.join(local_path, f"{artifact_prefix}_model.pkl")
    vectorizer_filename = os.path.join(local_path, f"{artifact_prefix}_vectorizer.pkl")
    for target in (model_filename, prefixed_model_filename):
        with open(target, "wb") as model_file:
            pickle.dump(model, model_file)
    with open(vectorizer_filename, "wb") as vectorizer_file:
        pickle.dump(vectorizer, vectorizer_file)

    # Save metrics to the local folder
    if metrics:
        with open(os.path.join(local_path, f"{artifact_prefix}_metrics.json"), "w") as f:
            json.dump(metrics, f, indent=4)

    # copytree requires that the destination does not exist, so drop any old copy first.
    if os.path.exists(drive_path):
        shutil.rmtree(drive_path)

    # Copy to Google Drive
    shutil.copytree(local_path, drive_path)
    print(f"Model and metrics saved to Google Drive at: {drive_path}")

# Metrics for Logistic Regression (cleaned dataset).
# These describe the baseline model: accuracy_log_reg and y_pred come from
# log_reg_model, and accuracy_cv_log_reg from its 5-fold cross-validation.
metrics_log_reg = {
    "accuracy": accuracy_log_reg,
    "cross_val_accuracy": accuracy_cv_log_reg,
    "classification_report": classification_report(y_test, y_pred, output_dict=True),
    "confusion_matrix": confusion_matrix(y_test, y_pred).tolist()
}

# Export the baseline model (not best_log_reg_model) with its vectorizer and metrics.
save_model_to_drive(log_reg_model, vectorizer, "Logistic Regression", metrics=metrics_log_reg)

Mounted at /content/drive
Model and metrics saved to Google Drive at: /content/drive/My Drive/IT1244_Team1_Project/Model & Dataset/Logistic Regression


## For cleaned and deduplicated dataset

### Importing Dataset

In [70]:
# Path of the cleaned and deduplicated dataset CSV inside the project folder on Drive.
data_path_cleaned_deduplicated='/content/drive/My Drive/IT1244_Team1_Project/Model & Dataset/ml_cleaned_deduplicated.csv'

In [71]:
# Load the cleaned and deduplicated dataset and preview its first rows.
# Note: this rebinds `df`, replacing the cleaned dataset loaded earlier.
df=pd.read_csv(data_path_cleaned_deduplicated)
df.head()

Unnamed: 0,label,tweet,clean_tweet,lemmatized_tweet,tweet_length
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",awww that bummer shoulda david carr third,awww bummer shoulda david carr third,6
1,0,is upset that he can't update his Facebook by ...,upset that cannot update facebook texting migh...,upset update facebook texting might result sch...,10
2,0,@Kenichan I dived many times for the ball. Man...,dived many times ball managed save rest bounds,dived many time ball managed save rest bound,8
3,0,my whole body feels itchy and like its on fire,whole body feels itchy like fire,whole body feel itchy like fire,6
4,0,"@nationwideclass no, it's not behaving at all....",behaving here because cannot over there,behaving,1


### Feature Extraction

In [72]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

#### Perform TF-IDF Vectorization

In [73]:
def tfidf_vetorization(data,ngram_range=(1,2),max_features=10000,min_df=3,max_df=0.7):
    """Fit a TF-IDF vectorizer on ``data`` and return the transformed matrix.

    This re-defines the helper of the same name from earlier in the notebook.
    The vectorizer learns its vocabulary and IDF weights from exactly the
    documents passed in, so pass only training text if the test text must not
    influence the features.

    Args:
        data: Iterable of text documents to fit on and transform.
        ngram_range: ``(min_n, max_n)`` n-gram sizes to extract; the default
            ``(1, 2)`` yields unigrams and bigrams.
        max_features: Upper bound on vocabulary size (default 10000).
        min_df: Minimum document frequency for a term to be kept (default 3).
        max_df: Maximum document frequency for a term to be kept (default 0.7).

    Returns:
        A ``(tfidf_matrix, vectorizer)`` tuple: the matrix produced by
        ``fit_transform`` and the fitted ``TfidfVectorizer``.
    """
    vectorizer = TfidfVectorizer(
        max_features=max_features,
        ngram_range=ngram_range,
        min_df=min_df,
        max_df=max_df,
    )
    tfidf_matrix = vectorizer.fit_transform(data)
    return tfidf_matrix, vectorizer

#### Split into train and test dataset

In [74]:
def train_test_splitting(X,y,test_size=0.2,random_state=42):
    """Split features and labels into train and test partitions.

    This re-defines the helper of the same name from earlier in the notebook.

    Args:
        X: Feature matrix to split.
        y: Labels aligned row-for-row with ``X``.
        test_size: Forwarded to ``train_test_split`` (default 0.2).
        random_state: Seed forwarded to ``train_test_split``; the default 42
            is the value that was previously hard-coded here.

    Returns:
        A ``(X_train, X_test, y_train, y_test)`` tuple.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

In [75]:
# Replace missing lemmatized tweets with empty strings before vectorizing
# (presumably so the vectorizer only ever receives str values).
df['lemmatized_tweet'] = df['lemmatized_tweet'].fillna("")
# NOTE(review): the vectorizer is fitted on the whole column before the split
# below, so vocabulary and IDF weights are learned from rows that later land in
# the test set — confirm this is acceptable for the reported test metrics.
# This also rebinds tfidf_matrix / vectorizer from the cleaned-dataset section.
tfidf_matrix, vectorizer = tfidf_vetorization(df['lemmatized_tweet'])
# Split with the helper's defaults (test_size=0.2, random_state=42).
X_train,X_test,y_train,y_test=train_test_splitting(tfidf_matrix,df['label'])
print(f"Train set shape:{X_train.shape}, Test set shape: {X_test.shape}")

Train set shape:(78105, 10000), Test set shape: (19527, 10000)


In [76]:
# Show the first 20 entries of the fitted vocabulary as a sanity check.
print(vectorizer.get_feature_names_out()[:20])

['aaah' 'aargh' 'aaron' 'abandoned' 'abby' 'ability' 'abit' 'able'
 'able make' 'able sleep' 'able work' 'abroad' 'absent' 'absolute'
 'absolutely' 'absolutely gorgeous' 'absolutely nothing' 'absolutly'
 'academy' 'accent']


### Base Model - Logistic Regression

In [77]:
#scikit learn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

#### Model Initialisation

In [78]:
# Baseline logistic regression for the cleaned and deduplicated dataset: fit on
# the training split, then predict labels for the held-out test split.
log_reg_model = LogisticRegression(max_iter=1000)
log_reg_model.fit(X_train, y_train)
y_pred = log_reg_model.predict(X_test)

#### Evaluation of Initial Model

In [79]:
# Baseline evaluation on the test split of the cleaned and deduplicated
# dataset: overall accuracy, per-class report and confusion matrix, in that order.
accuracy_log_reg = accuracy_score(y_test, y_pred)
print(
    f"Logistic Regression Accuracy: {accuracy_log_reg:.4f}",
    "\nLogistic Regression Classification Report:",
    classification_report(y_test, y_pred),
    "\nLogistic Regression Confusion Matrix:",
    confusion_matrix(y_test, y_pred),
    sep="\n",
)

Logistic Regression Accuracy: 0.7614

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.74      0.76      9829
           1       0.75      0.79      0.77      9698

    accuracy                           0.76     19527
   macro avg       0.76      0.76      0.76     19527
weighted avg       0.76      0.76      0.76     19527


Logistic Regression Confusion Matrix:
[[7233 2596]
 [2064 7634]]


### Perform 5-Fold Cross-Validation of Initialised Model

In [80]:
from sklearn.model_selection import cross_val_score, cross_val_predict

In [81]:
# 5-fold cross-validated predictions over the full TF-IDF matrix of the cleaned
# and deduplicated dataset (one out-of-fold prediction per row).
# NOTE(review): tfidf_matrix was produced by a vectorizer fitted on all rows, so
# the folds share vocabulary/IDF statistics — confirm this is intended.
y_pred_cv_log_reg=cross_val_predict(log_reg_model,tfidf_matrix,df['label'],cv=5)

#### Evaluation of Initialised Model with 5-Fold Cross-Validation

In [82]:
# Imported here so this cell is self-contained; cross_validate scores several
# metrics from a single set of cross-validation fits.
from sklearn.model_selection import cross_validate

# Classification report on the pooled out-of-fold predictions
print("\nClassification Report:")
print(classification_report(df['label'],y_pred_cv_log_reg))

# Confusion matrix on the pooled out-of-fold predictions
print("\nConfusion Matrix:")
print(confusion_matrix(df['label'],y_pred_cv_log_reg))

# Accuracy over all out-of-fold predictions
accuracy_cv_log_reg=accuracy_score(df['label'],y_pred_cv_log_reg)
print(f"Accuracy: {accuracy_cv_log_reg:.4f}")

# Weighted precision / recall / F1 averaged across the 5 folds. One
# cross_validate call replaces three separate cross_val_score runs, so the
# model is fitted 5 times here instead of 15; the per-fold scores are the same.
cv_scores_log_reg = cross_validate(
    log_reg_model,
    tfidf_matrix,
    df['label'],
    cv=5,
    scoring=['precision_weighted', 'recall_weighted', 'f1_weighted'],
)

precision_cv = cv_scores_log_reg['test_precision_weighted'].mean()
print(f"Precision: {precision_cv:.4f}")

recall_cv = cv_scores_log_reg['test_recall_weighted'].mean()
print(f"Recall: {recall_cv:.4f}")

f1_cv = cv_scores_log_reg['test_f1_weighted'].mean()
print(f"f1-score: {f1_cv:.4f}")


Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.72      0.75     48816
           1       0.74      0.78      0.76     48816

    accuracy                           0.75     97632
   macro avg       0.75      0.75      0.75     97632
weighted avg       0.75      0.75      0.75     97632


Confusion Matrix:
[[35300 13516]
 [10611 38205]]
Accuracy: 0.7529
Precision: 0.7538
Recall: 0.7529
f1-score: 0.7526


### Hyperparameter Tuning with 5-Fold Cross-Validation

In [83]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [84]:
# Hyperparameter tuning for logistic regression on the cleaned and deduplicated dataset
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # Inverse regularization strength (smaller = stronger penalty)
    'solver': ['lbfgs', 'liblinear'],  # Different solvers for comparison
    'max_iter': [500, 1000, 1500]
}

# Exhaustive search over the grid with 5-fold CV on the training split only,
# selecting by accuracy and using all available cores.
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# GridSearchCV is used with its default refit=True, so best_estimator_ has
# already been retrained on all of X_train with the winning parameters; a
# second explicit fit on the same data would only repeat that work.
best_log_reg_model = grid_search.best_estimator_
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

# Predict using the tuned model
y_pred_tuned = best_log_reg_model.predict(X_test)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best Hyperparameters: {'C': 1, 'max_iter': 500, 'solver': 'liblinear'}
Best Cross-Validation Score: 0.7579796427885539


#### Evaluation of Tuned Model

In [85]:
# Tuned model on the test split of the cleaned and deduplicated dataset:
# report and confusion matrix.
print("\nTuned Model Classification Report:", classification_report(y_test, y_pred_tuned), sep="\n")
print("\nTuned Model Confusion Matrix:", confusion_matrix(y_test, y_pred_tuned), sep="\n")

# Scalar metrics for the tuned model; precision/recall/F1 are support-weighted
# averages over the classes.
accuracy_tuned = accuracy_score(y_test, y_pred_tuned)
precision_tuned, recall_tuned, f1_tuned = (
    scorer(y_test, y_pred_tuned, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

# Report the four scores to four decimal places.
print(f"Tuned Model Accuracy: {accuracy_tuned:.4f}")
print(f"Tuned Model Precision: {precision_tuned:.4f}")
print(f"Tuned Model Recall: {recall_tuned:.4f}")
print(f"Tuned Model F1-Score: {f1_tuned:.4f}")


Tuned Model Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.73      0.76      9829
           1       0.75      0.79      0.77      9698

    accuracy                           0.76     19527
   macro avg       0.76      0.76      0.76     19527
weighted avg       0.76      0.76      0.76     19527


Tuned Model Confusion Matrix:
[[7223 2606]
 [2056 7642]]
Tuned Model Accuracy: 0.7613
Tuned Model Precision: 0.7622
Tuned Model Recall: 0.7613
Tuned Model F1-Score: 0.7611


### Exporting Model

In [86]:
import os
import json
import shutil
import pickle

# Save Model to Drive
def save_model_to_drive(model, vectorizer, model_name, metrics=None, artifact_prefix="cleaned_deduplicated"):
    """Pickle a model and its vectorizer locally, then copy the folder to Drive.

    Files are written to ``./<model_name>`` and that folder then replaces
    ``<project folder>/<model_name>`` on Google Drive. Anything already present
    in the local folder is copied along with the new files.

    The model is written as ``model.pkl`` (the existing file name) and also as
    ``<artifact_prefix>_model.pkl``. ``model.pkl`` does not depend on the
    prefix, so exporting into a ``model_name`` folder that already holds a
    ``model.pkl`` overwrites it; the prefixed copy keeps this model stored next
    to the vectorizer it was trained with.

    Args:
        model: Fitted estimator to pickle.
        vectorizer: Fitted vectorizer to pickle.
        model_name: Folder name used both locally and under the Drive project
            folder.
        metrics: Optional JSON-serialisable mapping; written only when truthy.
        artifact_prefix: Prefix for the vectorizer, metrics and extra model
            file names. The default "cleaned_deduplicated" reproduces the
            previous file names ``cleaned_deduplicated_vectorizer.pkl`` and
            ``cleaned_deduplicated_metrics.json``.
    """
    # Re-mount so the Drive path is available even if the mount has gone stale.
    drive.mount("/content/drive", force_remount=True)

    # Paths
    local_path = f"./{model_name}"
    drive_base_path = "/content/drive/My Drive/IT1244_Team1_Project/Model & Dataset"
    drive_path = os.path.join(drive_base_path, model_name)

    # Create the local directory; a no-op when it already exists.
    os.makedirs(local_path, exist_ok=True)

    # Save the model under the legacy name and under a prefixed name.
    model_filename = os.path.join(local_path, "model.pkl")
    prefixed_model_filename = os.path.join(local_path, f"{artifact_prefix}_model.pkl")
    vectorizer_filename = os.path.join(local_path, f"{artifact_prefix}_vectorizer.pkl")
    for target in (model_filename, prefixed_model_filename):
        with open(target, "wb") as model_file:
            pickle.dump(model, model_file)
    with open(vectorizer_filename, "wb") as vectorizer_file:
        pickle.dump(vectorizer, vectorizer_file)

    # Save metrics to the local folder
    if metrics:
        with open(os.path.join(local_path, f"{artifact_prefix}_metrics.json"), "w") as f:
            json.dump(metrics, f, indent=4)

    # copytree requires that the destination does not exist, so drop any old copy first.
    if os.path.exists(drive_path):
        shutil.rmtree(drive_path)

    # Copy to Google Drive
    shutil.copytree(local_path, drive_path)
    print(f"Model and metrics saved to Google Drive at: {drive_path}")

# Metrics for Logistic Regression (cleaned and deduplicated dataset).
# These describe the baseline model: accuracy_log_reg and y_pred come from
# log_reg_model, and accuracy_cv_log_reg from its 5-fold cross-validation.
metrics_log_reg = {
    "accuracy": accuracy_log_reg,
    "cross_val_accuracy": accuracy_cv_log_reg,
    "classification_report": classification_report(y_test, y_pred, output_dict=True),
    "confusion_matrix": confusion_matrix(y_test, y_pred).tolist()
}

# Export the baseline model (not best_log_reg_model) with its vectorizer and metrics.
# NOTE(review): this reuses the "Logistic Regression" folder name from the
# earlier export in this notebook, and the save helper writes the model as
# model.pkl, so the model.pkl saved for the cleaned dataset is replaced —
# confirm that is intended.
save_model_to_drive(log_reg_model, vectorizer, "Logistic Regression", metrics=metrics_log_reg)

Mounted at /content/drive
Model and metrics saved to Google Drive at: /content/drive/My Drive/IT1244_Team1_Project/Model & Dataset/Logistic Regression
