#**Maximum Entropy Classifier**

In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

##For cleaned datset

###Importing Dataset

In [None]:
from google.colab import drive
drive.mount('/content/drive')
data_path='/content/drive/My Drive/IT1244_Team1_Project/Model & Dataset/ml_cleaned.csv'

Mounted at /content/drive


In [None]:
#importing dataset
df=pd.read_csv(data_path)
df.head()

Unnamed: 0,label,tweet,clean_tweet,lemmatized_tweet,tweet_length
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",awww that bummer shoulda david carr third,awww bummer shoulda david carr third,6
1,0,is upset that he can't update his Facebook by ...,upset that cannot update facebook texting migh...,upset update facebook texting might result sch...,10
2,0,@Kenichan I dived many times for the ball. Man...,dived many times ball managed save rest bounds,dived many time ball managed save rest bound,8
3,0,my whole body feels itchy and like its on fire,whole body feels itchy like fire,whole body feel itchy like fire,6
4,0,"@nationwideclass no, it's not behaving at all....",behaving here because cannot over there,behaving,1


###Feature Extraction

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

####Perform TF-IDF Vectorization

In [None]:
def tfidf_vetorization(data,ngram_range=(1,2),max_features=10000,min_df=3,max_df=0.8):
    vectorizer=TfidfVectorizer(max_features=max_features,ngram_range=ngram_range,min_df=min_df,max_df=max_df)
    tfidf_matrix=vectorizer.fit_transform(data)
    return tfidf_matrix,vectorizer

####Split into train and test dataset

In [None]:
def train_test_splitting(X,y,test_size=0.2):
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=test_size,random_state=42)
    return X_train,X_test,y_train,y_test

In [None]:
df['lemmatized_tweet'] = df['lemmatized_tweet'].fillna("")
tfidf_matrix, vectorizer = tfidf_vetorization(df['lemmatized_tweet'])
X_train,X_test,y_train,y_test=train_test_splitting(tfidf_matrix,df['label'])
print(f"Train set shape:{X_train.shape}, Test set shape: {X_test.shape}")

Train set shape:(78944, 10000), Test set shape: (19737, 10000)


In [None]:
print(vectorizer.get_feature_names_out()[:20])

['aaaah' 'aaah' 'aaaw' 'aargh' 'aaron' 'abandoned' 'abby' 'aberdeen'
 'ability' 'abit' 'able' 'able make' 'able sleep' 'able take' 'abroad'
 'absent' 'absolute' 'absolutely' 'absolutely love' 'absolutely nothing']


###Maximum Entropy Classifier

In [None]:
#scikit learn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, cross_val_predict

####Model Initialisation

In [None]:
#Maximum Entropy model using Logistic Regression (multinomial)
max_ent_model = LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs')
max_ent_model.fit(X_train, y_train)

#predict on the test set
y_pred_max_ent = max_ent_model.predict(X_test)

####Evaluation of Initialised Model

In [None]:
#evaluate the model
accuracy_max_ent = accuracy_score(y_test, y_pred_max_ent)
print(f"Accuracy: {accuracy_max_ent:.4f}")

#classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred_max_ent))

#confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_max_ent))

Accuracy: 0.7507

Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.73      0.75      9900
           1       0.74      0.77      0.75      9837

    accuracy                           0.75     19737
   macro avg       0.75      0.75      0.75     19737
weighted avg       0.75      0.75      0.75     19737


Confusion Matrix:
[[7265 2635]
 [2286 7551]]


###5-Fold Cross-Validation of Maximum Entropy Classifier

In [None]:
#cross-validation
y_pred_cv_max_ent = cross_val_predict(max_ent_model, tfidf_matrix, df['label'], cv=5)

####Evaluation of Initialised Model with 5-Fold Cross-Validation

In [None]:
# Cross-validation classification report
print("\nCross-Validation Classification Report:")
print(classification_report(df['label'], y_pred_cv_max_ent))

# Cross-validation confusion matrix
print("\nCross-Validation Confusion Matrix:")
print(confusion_matrix(df['label'], y_pred_cv_max_ent))

# Calculate individual score for cross-validation
accuracy_cv_max_ent = accuracy_score(df['label'], y_pred_cv_max_ent)
print(f"Cross-Validation Accuracy: {accuracy_cv_max_ent:.4f}")

precision_cv_max_ent = cross_val_score(max_ent_model, tfidf_matrix, df['label'], cv=5, scoring='precision_weighted').mean()
print(f"Cross-Validation Precision: {precision_cv_max_ent:.4f}")

recall_cv_max_ent = cross_val_score(max_ent_model, tfidf_matrix, df['label'], cv=5, scoring='recall_weighted').mean()
print(f"Cross-Validation Recall: {recall_cv_max_ent:.4f}")

f1_cv_max_ent = cross_val_score(max_ent_model, tfidf_matrix, df['label'], cv=5, scoring='f1_weighted').mean()
print(f"Cross-Validation f1-score: {f1_cv_max_ent:.4f}")


Cross-Validation Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.72      0.73     49400
           1       0.73      0.76      0.74     49281

    accuracy                           0.74     98681
   macro avg       0.74      0.74      0.74     98681
weighted avg       0.74      0.74      0.74     98681


Cross-Validation Confusion Matrix:
[[35356 14044]
 [11847 37434]]
Cross-Validation Accuracy: 0.7376
Cross-Validation Precision: 0.7382
Cross-Validation Recall: 0.7376
Cross-Validation f1-score: 0.7375


###Hyperparameter Tuning with 5-Fold Cross-Validation

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Hyperparameter Tuning for Maximum Entropy Model
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
    'solver': ['lbfgs','saga'],  # Solvers for multinomial classification
    'max_iter': [5000, 10000, 15000],  # Number of iterations
    'multi_class': ['multinomial']
}

grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_max_ent_model = grid_search.best_estimator_
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

# Train the best model on the full training set
best_max_ent_model.fit(X_train, y_train)

# Predict using the tuned model
y_pred_tuned = best_max_ent_model.predict(X_test)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best Hyperparameters: {'C': 1, 'max_iter': 10000, 'multi_class': 'multinomial', 'solver': 'saga'}
Best Cross-Validation Score: 0.7433116525292061


####Evaluation of Tuned Model

In [None]:
# Evaluate tuned model
print("\nTuned Model Classification Report:")
print(classification_report(y_test, y_pred_tuned))

print("\nTuned Model Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_tuned))

# Compute evaluation metrics for the tuned model
accuracy_tuned = accuracy_score(y_test, y_pred_tuned)
precision_tuned = precision_score(y_test, y_pred_tuned, average='weighted')
recall_tuned = recall_score(y_test, y_pred_tuned, average='weighted')
f1_tuned = f1_score(y_test, y_pred_tuned, average='weighted')

# Print scores
print(f"Tuned Model Accuracy: {accuracy_tuned:.4f}")
print(f"Tuned Model Precision: {precision_tuned:.4f}")
print(f"Tuned Model Recall: {recall_tuned:.4f}")
print(f"Tuned Model F1-Score: {f1_tuned:.4f}")


Tuned Model Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.73      0.75      9900
           1       0.74      0.77      0.75      9837

    accuracy                           0.75     19737
   macro avg       0.75      0.75      0.75     19737
weighted avg       0.75      0.75      0.75     19737


Tuned Model Confusion Matrix:
[[7251 2649]
 [2268 7569]]
Tuned Model Accuracy: 0.7509
Tuned Model Precision: 0.7513
Tuned Model Recall: 0.7509
Tuned Model F1-Score: 0.7508


###Exporting Model

In [None]:
import os
import json
import shutil
import pickle

# Save Model to Drive
def save_model_to_drive(model, vectorizer, model_name, metrics=None):
    # Mount Google Drive
    drive.mount("/content/drive", force_remount=True)

    # Paths
    local_path = f"./{model_name}"
    drive_base_path = "/content/drive/My Drive/IT1244_Team1_Project/Model & Dataset"
    drive_path = os.path.join(drive_base_path, model_name)

    # Create local directory if it doesn't exist
    if not os.path.exists(local_path):
        os.makedirs(local_path)

    # Save model and vectorizer locally
    model_filename = os.path.join(local_path, "model.pkl")
    vectorizer_filename = os.path.join(local_path, "cleaned_vectorizer.pkl")
    with open(model_filename, "wb") as model_file:
        pickle.dump(model, model_file)
    with open(vectorizer_filename, "wb") as vectorizer_file:
        pickle.dump(vectorizer, vectorizer_file)

    # Save metrics to local folder
    if metrics:
        with open(os.path.join(local_path, "cleaned_metrics.json"), "w") as f:
            json.dump(metrics, f, indent=4)

    # Remove old copy in Google Drive (if exists)
    if os.path.exists(drive_path):
        shutil.rmtree(drive_path)

    # Copy to Google Drive
    shutil.copytree(local_path, drive_path)
    print(f"Model and metrics saved to Google Drive at: {drive_path}")

# Metrics for Maximum Entropy
metrics_max_ent = {
    "accuracy": accuracy_max_ent,
    "cross_val_accuracy": accuracy_cv_max_ent,
    "classification_report": classification_report(y_test, y_pred_max_ent, output_dict=True),
    "confusion_matrix": confusion_matrix(y_test, y_pred_max_ent).tolist()
}
save_model_to_drive(max_ent_model, vectorizer, "Maximum Entropy", metrics=metrics_max_ent)

Mounted at /content/drive
Model and metrics saved to Google Drive at: /content/drive/My Drive/IT1244_Team1_Project/Model & Dataset/Maximum Entropy


##For cleaned and deduplicated dataset

### Importing Dataset

In [None]:
from google.colab import drive
drive.mount('/content/drive')
data_path_cleaned_deduplicated='/content/drive/My Drive/IT1244_Team1_Project/Model & Dataset/ml_cleaned_deduplicated.csv'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#importing dataset
df=pd.read_csv(data_path_cleaned_deduplicated)
df.head()

Unnamed: 0,label,tweet,clean_tweet,lemmatized_tweet,tweet_length
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",awww that bummer shoulda david carr third,awww bummer shoulda david carr third,6
1,0,is upset that he can't update his Facebook by ...,upset that cannot update facebook texting migh...,upset update facebook texting might result sch...,10
2,0,@Kenichan I dived many times for the ball. Man...,dived many times ball managed save rest bounds,dived many time ball managed save rest bound,8
3,0,my whole body feels itchy and like its on fire,whole body feels itchy like fire,whole body feel itchy like fire,6
4,0,"@nationwideclass no, it's not behaving at all....",behaving here because cannot over there,behaving,1


###Feature Extraction

####Perform TF-IDF Vectorization

In [None]:
def tfidf_vetorization(data,ngram_range=(1,2),max_features=10000,min_df=3,max_df=0.8):
    vectorizer=TfidfVectorizer(max_features=max_features,ngram_range=ngram_range,min_df=min_df,max_df=max_df)
    tfidf_matrix=vectorizer.fit_transform(data)
    return tfidf_matrix,vectorizer

####Split into train and test dataset

In [None]:
def train_test_splitting(X,y,test_size=0.2):
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=test_size,random_state=42)
    return X_train,X_test,y_train,y_test

In [None]:
df['lemmatized_tweet'] = df['lemmatized_tweet'].fillna("")
tfidf_matrix, vectorizer = tfidf_vetorization(df['lemmatized_tweet'])
X_train,X_test,y_train,y_test=train_test_splitting(tfidf_matrix,df['label'])
print(f"Train set shape:{X_train.shape}, Test set shape: {X_test.shape}")

Train set shape:(78105, 10000), Test set shape: (19527, 10000)


In [None]:
print(vectorizer.get_feature_names_out()[:20])

['aaah' 'aargh' 'aaron' 'abandoned' 'abby' 'ability' 'abit' 'able'
 'able make' 'able sleep' 'able work' 'abroad' 'absent' 'absolute'
 'absolutely' 'absolutely gorgeous' 'absolutely nothing' 'absolutly'
 'academy' 'accent']


###Maximum Entropy Classifier

In [None]:
#scikit learn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, cross_val_predict

####Model Initialisation

In [None]:
#Maximum Entropy model using Logistic Regression (multinomial)
max_ent_model = LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs')
max_ent_model.fit(X_train, y_train)

#predict on the test set
y_pred_max_ent = max_ent_model.predict(X_test)

####Evaluation of Initialised Model

In [None]:
#evaluate the model
accuracy_max_ent = accuracy_score(y_test, y_pred_max_ent)
print(f"Accuracy: {accuracy_max_ent:.4f}")

#classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred_max_ent))

#confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_max_ent))

Accuracy: 0.7626

Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.74      0.76      9829
           1       0.75      0.79      0.77      9698

    accuracy                           0.76     19527
   macro avg       0.76      0.76      0.76     19527
weighted avg       0.76      0.76      0.76     19527


Confusion Matrix:
[[7230 2599]
 [2036 7662]]


###5-Fold Cross-Validation of Maximum Entropy Classifier

In [None]:
#cross-validation
y_pred_cv_max_ent = cross_val_predict(max_ent_model, tfidf_matrix, df['label'], cv=5)

####Evaluation of Initialised Model with 5-Fold Cross-Validation

In [None]:
# Cross-validation classification report
print("\nCross-Validation Classification Report:")
print(classification_report(df['label'], y_pred_cv_max_ent))

# Cross-validation confusion matrix
print("\nCross-Validation Confusion Matrix:")
print(confusion_matrix(df['label'], y_pred_cv_max_ent))

# Calculate individual score for cross-validation
accuracy_cv_max_ent = accuracy_score(df['label'], y_pred_cv_max_ent)
print(f"Cross-Validation Accuracy: {accuracy_cv_max_ent:.4f}")

precision_cv_max_ent = cross_val_score(max_ent_model, tfidf_matrix, df['label'], cv=5, scoring='precision_weighted').mean()
print(f"Cross-Validation Precision: {precision_cv_max_ent:.4f}")

recall_cv_max_ent = cross_val_score(max_ent_model, tfidf_matrix, df['label'], cv=5, scoring='recall_weighted').mean()
print(f"Cross-Validation Recall: {recall_cv_max_ent:.4f}")

f1_cv_max_ent = cross_val_score(max_ent_model, tfidf_matrix, df['label'], cv=5, scoring='f1_weighted').mean()
print(f"Cross-Validation f1-score: {f1_cv_max_ent:.4f}")


Cross-Validation Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.72      0.75     48816
           1       0.74      0.78      0.76     48816

    accuracy                           0.75     97632
   macro avg       0.75      0.75      0.75     97632
weighted avg       0.75      0.75      0.75     97632


Cross-Validation Confusion Matrix:
[[35338 13478]
 [10632 38184]]
Cross-Validation Accuracy: 0.7531
Cross-Validation Precision: 0.7540
Cross-Validation Recall: 0.7531
Cross-Validation f1-score: 0.7528


###Hyperparameter Tuning with 5-Fold Cross-Validation

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Hyperparameter Tuning for Maximum Entropy Model
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
    'solver': ['lbfgs','saga'],  # Solvers for multinomial classification
    'max_iter': [5000, 10000, 15000],  # Number of iterations
    'multi_class': ['multinomial']
}

grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_max_ent_model = grid_search.best_estimator_
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

# Train the best model on the full training set
best_max_ent_model.fit(X_train, y_train)

# Predict using the tuned model
y_pred_tuned = best_max_ent_model.predict(X_test)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best Hyperparameters: {'C': 1, 'max_iter': 10000, 'multi_class': 'multinomial', 'solver': 'saga'}
Best Cross-Validation Score: 0.7587350361692593


####Evaluation of Tuned Model

In [None]:
# Evaluate tuned model
print("\nTuned Model Classification Report:")
print(classification_report(y_test, y_pred_tuned))

print("\nTuned Model Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_tuned))

# Compute evaluation metrics for the tuned model
accuracy_tuned = accuracy_score(y_test, y_pred_tuned)
precision_tuned = precision_score(y_test, y_pred_tuned, average='weighted')
recall_tuned = recall_score(y_test, y_pred_tuned, average='weighted')
f1_tuned = f1_score(y_test, y_pred_tuned, average='weighted')

# Print scores
print(f"Tuned Model Accuracy: {accuracy_tuned:.4f}")
print(f"Tuned Model Precision: {precision_tuned:.4f}")
print(f"Tuned Model Recall: {recall_tuned:.4f}")
print(f"Tuned Model F1-Score: {f1_tuned:.4f}")


Tuned Model Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.74      0.76      9829
           1       0.75      0.79      0.77      9698

    accuracy                           0.76     19527
   macro avg       0.76      0.76      0.76     19527
weighted avg       0.76      0.76      0.76     19527


Tuned Model Confusion Matrix:
[[7232 2597]
 [2028 7670]]
Tuned Model Accuracy: 0.7631
Tuned Model Precision: 0.7641
Tuned Model Recall: 0.7631
Tuned Model F1-Score: 0.7630


###Exporting Model

In [None]:
import os
import json
import shutil
import pickle

# Save Model to Drive
def save_model_to_drive(model, vectorizer, model_name, metrics=None):
    # Mount Google Drive
    drive.mount("/content/drive", force_remount=True)

    # Paths
    local_path = f"./{model_name}"
    drive_base_path = "/content/drive/My Drive/IT1244_Team1_Project/Model & Dataset"
    drive_path = os.path.join(drive_base_path, model_name)

    # Create local directory if it doesn't exist
    if not os.path.exists(local_path):
        os.makedirs(local_path)

    # Save model and vectorizer locally
    model_filename = os.path.join(local_path, "model.pkl")
    vectorizer_filename = os.path.join(local_path, "cleaned_deduplicated_vectorizer.pkl")
    with open(model_filename, "wb") as model_file:
        pickle.dump(model, model_file)
    with open(vectorizer_filename, "wb") as vectorizer_file:
        pickle.dump(vectorizer, vectorizer_file)

    # Save metrics to local folder
    if metrics:
        with open(os.path.join(local_path, "cleaned_deduplicated_metrics.json"), "w") as f:
            json.dump(metrics, f, indent=4)

    # Remove old copy in Google Drive (if exists)
    if os.path.exists(drive_path):
        shutil.rmtree(drive_path)

    # Copy to Google Drive
    shutil.copytree(local_path, drive_path)
    print(f"Model and metrics saved to Google Drive at: {drive_path}")

# Metrics for Maximum Entropy
metrics_max_ent = {
    "accuracy": accuracy_max_ent,
    "cross_val_accuracy": accuracy_cv_max_ent,
    "classification_report": classification_report(y_test, y_pred_max_ent, output_dict=True),
    "confusion_matrix": confusion_matrix(y_test, y_pred_max_ent).tolist()
}
save_model_to_drive(max_ent_model, vectorizer, "Maximum Entropy", metrics=metrics_max_ent)

Mounted at /content/drive
Model and metrics saved to Google Drive at: /content/drive/My Drive/IT1244_Team1_Project/Model & Dataset/Maximum Entropy
