In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.decomposition import PCA
from sklearn.utils.class_weight import compute_sample_weight

In [1]:
# Load the feature table that every model section below reads from.
df = pd.read_csv(filepath_or_buffer='dataset/final_test_v2.csv')

### Support Vector Machine

In [9]:
# Target column and feature matrix (label and the two text columns removed).
y = df['indicator_label']
X = df.drop(['normal_text', 'indicator_label', 'cleaned_text'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit PCA on the training split only and reuse that projection for the test
# split. The seed pins the result if a randomized SVD solver gets selected.
pca = PCA(n_components=256, random_state=42)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# `degree` is intentionally not passed: it only applies to the 'poly' kernel
# and is ignored by 'rbf'. `random_state` seeds the internal cross-validation
# that `probability=True` relies on, so probability estimates are repeatable.
model = SVC(C=4,
            kernel='rbf',
            gamma='scale',
            class_weight='balanced',
            probability=True,
            random_state=42)
model.fit(X_train_pca, y_train)

# Evaluate on the held-out split.
y_pred = model.predict(X_test_pca)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.91
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.96      0.93      6432
           1       0.92      0.81      0.86      3568

    accuracy                           0.91     10000
   macro avg       0.91      0.89      0.90     10000
weighted avg       0.91      0.91      0.91     10000



### Random Forest

In [3]:
# Target column and feature matrix (label and the two text columns removed).
y = df['indicator_label']
X = df.drop(['normal_text', 'indicator_label', 'cleaned_text'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit PCA on the training split only and reuse that projection for the test
# split. The seed pins the result if a randomized SVD solver gets selected.
pca = PCA(n_components=256, random_state=42)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Bootstrap sampling and per-split feature sub-sampling are random, so the
# forest is seeded to make the reported metrics reproducible.
rf_model = RandomForestClassifier(
    n_estimators=500,
    criterion='gini',
    min_samples_split=8,
    min_samples_leaf=8,
    max_features='sqrt',
    class_weight='balanced',
    random_state=42,
)

In [4]:
# Train the forest on the PCA-projected training split.
rf_model.fit(X=X_train_pca, y=y_train)

In [30]:
# Sanity check: number of input features the forest was fitted on
# (should equal the PCA `n_components` used above).
rf_model.n_features_in_

256

In [37]:
# Score the fitted random forest on the PCA-projected test split.
y_pred = rf_model.predict(X_test_pca)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(
    f"Accuracy: {accuracy:.2f}",
    "Classification Report:",
    classification_report(y_true=y_test, y_pred=y_pred),
    sep="\n",
)

Accuracy: 0.95
Classification Report:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      6432
           1       0.99      0.88      0.93      3568

    accuracy                           0.95     10000
   macro avg       0.96      0.94      0.95     10000
weighted avg       0.96      0.95      0.95     10000



### XGBoost

In [39]:
# Target column and feature matrix (label and the two text columns removed).
y = df['indicator_label']
X = df.drop(['normal_text', 'indicator_label', 'cleaned_text'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit PCA on the training split only and reuse that projection for the test
# split. The seed pins the result if a randomized SVD solver gets selected.
pca = PCA(n_components=256, random_state=42)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Despite the name, this is a per-sample weight vector (one entry per training
# row) derived from balanced class weights; it is passed to `fit` as
# `sample_weight`.
class_weights = compute_sample_weight(class_weight='balanced', y=y_train)

# Seed stated explicitly so the model configuration is reproducible on its own.
xgb_model = xgb.XGBClassifier(n_estimators=1000, learning_rate=0.01, random_state=42)

In [46]:
# Train XGBoost on the PCA-projected training split, weighting each row by
# its balanced per-sample weight.
xgb_model.fit(X=X_train_pca, y=y_train, sample_weight=class_weights)

In [47]:
# Score the fitted XGBoost model on the PCA-projected test split.
y_pred = xgb_model.predict(X_test_pca)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(
    f"Accuracy: {accuracy:.2f}",
    "Classification Report:",
    classification_report(y_true=y_test, y_pred=y_pred),
    sep="\n",
)

Accuracy: 0.98
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      6432
           1       0.98      0.95      0.97      3568

    accuracy                           0.98     10000
   macro avg       0.98      0.97      0.98     10000
weighted avg       0.98      0.98      0.98     10000



In [49]:
# Sanity check: one importance value per input feature the model was fitted on.
xgb_model.feature_importances_.shape[0]

256

### Logistic Regression

In [7]:
# Target column and feature matrix (label and the two text columns removed).
y = df['indicator_label']
X = df.drop(['normal_text', 'indicator_label', 'cleaned_text'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Per-sample weights derived from balanced class weights (one entry per
# training row), passed to `fit` as `sample_weight`.
class_weights = compute_sample_weight(class_weight='balanced', y=y_train)

# Create a fresh PCA here instead of refitting whatever `pca` object an earlier
# cell left in the kernel: that made this cell fail when run on its own and
# silently mutated a projection that another model had been trained with.
pca = PCA(n_components=256, random_state=42)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# 'saga' shuffles the data while optimizing, so it is seeded for repeatability.
# NOTE(review): the saved output under this cell shows an iteration-limit
# warning; if it persists after a re-run, scale the PCA features or raise
# `max_iter`.
lr_model = LogisticRegression(solver='saga', C=1, penalty='l2', max_iter=1000, random_state=42)
lr_model.fit(X_train_pca, y_train, sample_weight=class_weights)

# Evaluate on the held-out split.
y_pred = lr_model.predict(X_test_pca)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.92
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.90      0.93      6432
           1       0.83      0.95      0.89      3568

    accuracy                           0.92     10000
   macro avg       0.90      0.92      0.91     10000
weighted avg       0.92      0.92      0.92     10000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Gradient Boosting

In [2]:
# Target column and feature matrix (label and the two text columns removed).
y = df['indicator_label']
X = df.drop(['normal_text', 'indicator_label', 'cleaned_text'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Per-sample weights derived from balanced class weights (one entry per
# training row), passed to `fit` as `sample_weight`.
class_weights = compute_sample_weight(class_weight='balanced', y=y_train)

# Fit PCA on the training split only and reuse that projection for the test
# split. This `pca` object is the one persisted alongside `gb_model` further
# down, so it is seeded to make the saved projection reproducible.
pca = PCA(n_components=256, random_state=42)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

gb_model = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.01, random_state=42)
gb_model.fit(X_train_pca, y_train, sample_weight=class_weights)

# Evaluate on the held-out split.
y_pred = gb_model.predict(X_test_pca)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

print("Classification Report:")
print(classification_report(y_test, y_pred))

### Saving the model

In [7]:
# Persist the fitted gradient-boosting classifier for the app to load.
joblib.dump(value=gb_model, filename='app/model/gb_model_pca_v2.pkl')

['app/model/gb_model_pca_v2.pkl']

In [8]:
# Persist the PCA projection that goes with the saved model.
# Several cells in this notebook assign or refit the name `pca`; this dumps
# whichever instance it currently points at, so run this right after the
# Gradient Boosting cell to keep the saved model and projection paired.
joblib.dump(value=pca, filename='app/model/pca_v4.pkl')

['app/model/pca_v4.pkl']