In [3]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, precision_recall_curve
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

In [4]:
# Load the raw labelled tweets from 'train.csv' and preview the first rows.
# NOTE(review): `df` is rebound to 'train_balanced.csv' in the next cell, so this
# frame is only used for the preview shown here.
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,count,hate_speech_count,offensive_language_count,neither_count,class,tweet
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [5]:
# Load 'train_balanced.csv' and preview it; this rebinds `df`, replacing the raw
# frame loaded from 'train.csv'. The modelling cells below read the
# `cleaned_tweet` and `class` columns from this frame.
# NOTE(review): presumably class-balanced, going by the file name - confirm.
df = pd.read_csv('train_balanced.csv')
df.head()

Unnamed: 0,class,labels,cleaned_tweet,sentiment_score,weight
0,0,negative,roll blunt well bitch havana cause king round,-0.4019,3.0
1,0,negative,boat n hoe boat n hoe get ta boat n hoe,0.0,1.5
2,0,negative,loud jamaican horn u still single yall find ug...,-0.2263,1.5
3,0,negative,officially dub nasty un natural shade red look...,0.2023,1.5
4,0,negative,nothing bitch,0.4717,3.0


In [6]:
# Features are the already-cleaned tweet text; the target is the `class` column.
X, y = df['cleaned_tweet'], df['class']

# Build a TF-IDF representation capped at the 500 highest-frequency terms,
# dropping English stop words.
# NOTE(review): the vectorizer is fitted on the full dataset here, before the
# train/test split in the next cell, so the vocabulary and IDF weights also
# reflect rows that later end up in the test set.
vectorizer = TfidfVectorizer(
    max_features=500,
    stop_words='english',
)
X_vectorized = vectorizer.fit_transform(X)

# Show the sparse matrix summary (shape, stored elements, sample coordinates).
print(X_vectorized)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 29963 stored elements and shape (8117, 500)>
  Coords	Values
  (0, 36)	0.24778630928556977
  (0, 58)	0.6066508611015093
  (0, 218)	0.7553652610859419
  (1, 191)	0.8729563454457565
  (1, 414)	0.48779833840019265
  (2, 191)	0.3347449223231464
  (2, 376)	0.6916999245346459
  (2, 489)	0.6399195663344229
  (3, 339)	0.8135718921665095
  (3, 244)	0.31661503462869045
  (3, 162)	0.344000136974265
  (3, 156)	0.3457160711991353
  (4, 36)	1.0
  (5, 36)	0.19474564322483384
  (5, 449)	0.9808537783201661
  (6, 191)	0.21975847208330476
  (6, 170)	0.8187447885882279
  (6, 284)	0.27333474356248777
  (6, 461)	0.39537800997156897
  (6, 233)	0.22433798675164285
  (7, 36)	1.0
  (8, 36)	0.1494557184111953
  (8, 58)	0.36590980563903375
  (8, 244)	0.28833780569451345
  (8, 162)	0.3132771151252953
  :	:
  (8112, 89)	0.528920200020294
  (8112, 190)	0.4599717590773963
  (8113, 242)	0.19520642305442354
  (8113, 89)	0.28923262858473053
  (8113, 279)	0.23

In [7]:
# Split data into training and testing sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)

# SMOTE sampler used to balance the classes; `smote` is reused by the
# cross-validation cell further down.
smote = SMOTE(sampling_strategy='auto', random_state=42)

# Resampled copy of the training split. It is NOT fed to the grid search any
# more (see below); the names are kept so other cells that read them still work.
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Set up parameter grid for Random Forest tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Oversampling must happen INSIDE each CV fold. Running SMOTE once up front and
# then cross-validating lets synthetic points interpolated from a fold's
# validation rows appear in that fold's training data, which inflates the CV
# score and biases hyperparameter selection. The imblearn Pipeline applies the
# sampler during fit only and skips it at predict time.
rf_pipeline = Pipeline([
    ('smote', SMOTE(sampling_strategy='auto', random_state=42)),
    ('rf', RandomForestClassifier(random_state=42)),
])

# Same grid as `param_grid`, with keys prefixed for the 'rf' pipeline step.
pipeline_param_grid = {f'rf__{name}': values for name, values in param_grid.items()}

# Perform GridSearchCV to find best hyperparameters (fit on the un-resampled
# training split; the pipeline resamples per fold).
grid_search = GridSearchCV(rf_pipeline, pipeline_param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Best model after GridSearchCV: the fitted RandomForest step of the refit
# pipeline, i.e. a forest trained on the SMOTE-resampled training split only.
best_rf = grid_search.best_estimator_.named_steps['rf']

print(best_rf)

Fitting 3 folds for each of 81 candidates, totalling 243 fits
RandomForestClassifier(min_samples_split=10, n_estimators=200, random_state=42)


In [8]:
# Evaluate with Stratified K-Fold Cross-Validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for train_index, test_index in skf.split(X_vectorized, y):
    X_train_fold, X_test_fold = X_vectorized[train_index], X_vectorized[test_index]
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

    # Resample for each fold to balance classes (training part of the fold only)
    X_train_resampled_fold, y_train_resampled_fold = smote.fit_resample(X_train_fold, y_train_fold)

    # Train an unfitted copy with the tuned hyperparameters on this fold.
    # Fitting `best_rf` itself here would overwrite it with a model trained on
    # folds of the FULL dataset, which contain most of the held-out test rows,
    # and the "final" test-set evaluation below would then be contaminated.
    fold_rf = clone(best_rf)
    fold_rf.fit(X_train_resampled_fold, y_train_resampled_fold)

    # Evaluate the model
    y_pred_fold = fold_rf.predict(X_test_fold)
    print("Fold Accuracy:", accuracy_score(y_test_fold, y_pred_fold))
    print(classification_report(y_test_fold, y_pred_fold))

# Final evaluation on the test set, using `best_rf` exactly as the tuning cell left it
y_pred = best_rf.predict(X_test)
print("Final Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Step 1: Get predicted probabilities for the positive class (class 1)
# (column 1 of predict_proba; assumes best_rf.classes_ is [0, 1])
y_probs = best_rf.predict_proba(X_test)[:, 1]  # Probabilities for the positive class

# Step 2: Calculate precision-recall curve
precision, recall, thresholds = precision_recall_curve(y_test, y_probs)

# Step 3: Calculate F1-scores for each threshold.
# Guard the division: where precision + recall == 0 the F1 is defined as 0
# instead of NaN (np.argmax would otherwise return the NaN position).
pr_sum = precision + recall
f1_scores = np.divide(
    2 * (precision * recall),
    pr_sum,
    out=np.zeros_like(pr_sum, dtype=float),
    where=pr_sum > 0,
)

# Step 4: Find the best threshold (maximizing F1-score).
# `precision` and `recall` have one more element than `thresholds` (the final
# precision=1 / recall=0 point has no threshold), so the search is limited to
# the entries that actually map to a threshold.
best_index = int(np.argmax(f1_scores[:-1]))
best_threshold = thresholds[best_index]
print(f"Best Threshold: {best_threshold}")

# Step 5: Apply the best threshold to make predictions
y_pred_thresholded = (y_probs >= best_threshold).astype(int)

# Step 6: Evaluate performance with the new threshold
# NOTE(review): the threshold is selected on the same test set it is evaluated
# on, so the numbers below are optimistic; a separate validation split would
# give an unbiased estimate.
print("Final Accuracy with Optimized Threshold:", accuracy_score(y_test, y_pred_thresholded))
print(classification_report(y_test, y_pred_thresholded))

# You can also print specific metrics (precision, recall, F1) for class 1 (positive sentiment)
precision_class_1 = precision[best_index]
recall_class_1 = recall[best_index]
f1_class_1 = f1_scores[best_index]

print(f"Optimized Precision for Class 1 (Positive): {precision_class_1}")
print(f"Optimized Recall for Class 1 (Positive): {recall_class_1}")
print(f"Optimized F1-Score for Class 1 (Positive): {f1_class_1}")

Fold Accuracy: 0.9451970443349754
              precision    recall  f1-score   support

           0       0.96      0.92      0.94       812
           1       0.93      0.97      0.95       812

    accuracy                           0.95      1624
   macro avg       0.95      0.95      0.95      1624
weighted avg       0.95      0.95      0.95      1624

Fold Accuracy: 0.9445812807881774
              precision    recall  f1-score   support

           0       0.96      0.92      0.94       812
           1       0.93      0.97      0.95       812

    accuracy                           0.94      1624
   macro avg       0.95      0.94      0.94      1624
weighted avg       0.95      0.94      0.94      1624

Fold Accuracy: 0.944547134935305
              precision    recall  f1-score   support

           0       0.95      0.93      0.94       812
           1       0.93      0.96      0.95       811

    accuracy                           0.94      1623
   macro avg       0.94    

In [22]:
import pickle  # To save/load model (optional)
import numpy as np

# Function to make predictions
def predict_sentiment(new_tweets, model, vectorizer, threshold=0.5):
    """
    Predicts sentiment (positive/negative) for one or more new tweets.

    NOTE(review): the model in this notebook is trained on the `cleaned_tweet`
    column; this function does not clean its input, so callers presumably need
    to apply the same preprocessing first - confirm.

    :param new_tweets: List (or other iterable) of tweet texts; a single string
        is treated as one tweet
    :param model: Trained classifier exposing ``predict_proba``
        (e.g. the RandomForest model)
    :param vectorizer: Trained TfidfVectorizer (must be the one the model was
        trained with)
    :param threshold: Decision threshold (default = 0.5); a tweet is labelled
        'Positive' when its class-1 probability is >= threshold
    :return: List of predicted sentiments ('Positive' / 'Negative'), one per
        tweet; empty list for empty input
    """
    # A bare string would be iterated character by character by the
    # vectorizer, yielding one bogus prediction per character.
    if isinstance(new_tweets, str):
        new_tweets = [new_tweets]
    else:
        new_tweets = list(new_tweets)

    # Nothing to score: avoid handing a zero-row matrix to the model.
    if not new_tweets:
        return []

    # Transform the new tweets using the trained TF-IDF vectorizer
    new_tweets_vectorized = vectorizer.transform(new_tweets)

    # Get probability predictions for the positive class (class 1);
    # assumes column 1 of predict_proba corresponds to class 1.
    probabilities = model.predict_proba(new_tweets_vectorized)[:, 1]

    # Apply the decision threshold and convert to readable labels
    sentiment_labels = [
        'Positive' if probability >= threshold else 'Negative'
        for probability in probabilities
    ]

    return sentiment_labels

In [None]:
# Single example input, wrapped in a list because predict_sentiment expects a
# collection of tweets.
example_tweet = ["you are so fucking annoying"]

# Score it with the tuned forest, the fitted TF-IDF vectorizer and the
# F1-optimised decision threshold computed earlier in the notebook.
predicted_sentiment = predict_sentiment(
    example_tweet,
    best_rf,
    vectorizer,
    threshold=best_threshold,
)

# Report the tweet alongside its predicted label.
print(f"Tweet: {example_tweet[0]}")
print(f"Predicted Sentiment: {predicted_sentiment[0]}")


Tweet: cocksucker
Predicted Sentiment: Positive
