# Import Necessary Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import re
import os
import joblib
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK 'stopwords' corpus; the English stop-word list is read from it
# later when building `stop_words` for clean_text.
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ac\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Load and Combine Datasets

In [22]:
# Read one CSV per emotion label into its own DataFrame
happiness, angriness, sadness = map(
    pd.read_csv, ("happiness.csv", "angriness.csv", "sadness.csv")
)

# Preview the first rows of each frame, in the same order they were loaded
for emotion_frame in (happiness, angriness, sadness):
    print(emotion_frame.head())


                                             content  intensity
0  Wants to know how the hell I can remember word...  happiness
1  Love is a long sweet dream & marriage is an al...  happiness
2  The world could be amazing when you are slight...  happiness
3  My secret talent is getting tired without doin...  happiness
4  Khatarnaak Whatsapp Status Ever… Can\’t talk, ...  happiness
                                             content  intensity
0  Sometimes I’m not angry, I’m hurt and there’s ...  angriness
1                     Not available for busy people☺  angriness
2  I do not exist to impress the world. I exist t...  angriness
3  Everything is getting expensive except some pe...  angriness
4       My phone screen is brighter than my future 🙁  angriness
                                             content intensity
0  Never hurt people who love you a lot, because ...   sadness
1  Don’t expect me to tell you what you did wrong...   sadness
2  I preferred walking away than fighting f

## Combining datasets

In [23]:
# Stack the three per-emotion frames into a single DataFrame with a fresh 0..n-1 index
emotion_frames = [happiness, angriness, sadness]
data = pd.concat(emotion_frames, ignore_index=True)

# Preview the top of the combined dataset
print(data.head())

                                             content  intensity
0  Wants to know how the hell I can remember word...  happiness
1  Love is a long sweet dream & marriage is an al...  happiness
2  The world could be amazing when you are slight...  happiness
3  My secret talent is getting tired without doin...  happiness
4  Khatarnaak Whatsapp Status Ever… Can\’t talk, ...  happiness


# Data Preprocessing

In [3]:
def clean_text(text):
    """Normalize one piece of text for vectorization.

    Steps: lowercase, drop every character that is not a-z or whitespace,
    split on whitespace, drop stop words, and re-join with single spaces.

    Args:
        text: The raw text. Non-string values (e.g. the NaN float pandas
            produces for an empty CSV cell, or None) are treated as empty text.

    Returns:
        The cleaned text as a single space-separated string ('' if nothing is left).

    Note:
        Reads the module-level `stop_words` set, which must be defined before
        this function is called.
    """
    # Missing cells are not strings; calling .lower() on them would raise.
    if not isinstance(text, str):
        return ''

    letters_only = re.sub(r'[^a-z\s]', '', text.lower())  # Keep only letters and whitespace
    # split() already collapses whitespace runs and trims the ends,
    # so no separate whitespace-normalization pass is needed.
    return ' '.join(word for word in letters_only.split() if word not in stop_words)

# Build the English stop-word lookup that clean_text consults, then clean every row of 'content'
stop_words = set(stopwords.words('english'))
data['content_clean'] = data['content'].apply(clean_text)

# Split Data into Training and Testing Sets
features, labels = data['content_clean'], data['intensity']
X_train_text, X_test_text, y_train, y_test = train_test_split(
    features, labels, random_state=42, test_size=0.2
)


# TF-IDF Vectorization

In [5]:
# Turn the cleaned text into TF-IDF features over 1- to 3-word n-grams (capped at 10,000 features)
tfidf = TfidfVectorizer(ngram_range=(1, 3), max_features=10000)

# The vectorizer is fitted on the training split only; the test split is just transformed
X_train_tfidf = tfidf.fit_transform(X_train_text)
X_test_tfidf = tfidf.transform(X_test_text)


# Model Selection and Training

## Support Vector Machine (SVM)

In [10]:
# Fit an RBF-kernel support vector classifier on the TF-IDF training features
# ('C' is the regularization parameter)
svm_model = SVC(kernel='rbf', C=1).fit(X_train_tfidf, y_train)

# Score the fitted model on the held-out test split
y_pred_svm = svm_model.predict(X_test_tfidf)
report_svm = classification_report(y_test, y_pred_svm)
accuracy_svm = accuracy_score(y_test, y_pred_svm)

print(f"SVM Test Accuracy: {accuracy_svm}")
print(f"SVM Classification Report:\n{report_svm}")

SVM Test Accuracy: 0.7916666666666666
SVM Classification Report:
              precision    recall  f1-score   support

   angriness       0.85      0.81      0.83       133
   happiness       0.70      0.88      0.78       156
     sadness       0.92      0.65      0.76       119

    accuracy                           0.79       408
   macro avg       0.82      0.78      0.79       408
weighted avg       0.81      0.79      0.79       408



####  Grid Search for SVM Hyperparameters

In [11]:
# Candidate SVM settings: 3 values of C x 4 kernels x 2 gamma modes
param_grid_svm = dict(
    C=[0.1, 1, 10],
    kernel=['linear', 'rbf', 'poly', 'sigmoid'],
    gamma=['scale', 'auto'],
)

# Search over the grid on the training split; refit=True retrains the best candidate at the end
grid_svm = GridSearchCV(estimator=SVC(), param_grid=param_grid_svm, refit=True, verbose=1)
grid_svm.fit(X_train_tfidf, y_train)
print(f"Best SVM parameters: {grid_svm.best_params_}")

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best SVM parameters: {'C': 1, 'gamma': 'scale', 'kernel': 'sigmoid'}


## Logistic Regression

In [12]:
# Fit logistic regression on the TF-IDF training features (iteration cap raised to 1000)
log_reg = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Predict the held-out split and measure accuracy
y_pred_log = log_reg.predict(X_test_tfidf)
accuracy_log = accuracy_score(y_test, y_pred_log)

print(f"Logistic Regression Test Accuracy: {accuracy_log}")


Logistic Regression Test Accuracy: 0.7916666666666666


## Naive Bayes

In [13]:
# Fit a multinomial naive Bayes classifier on the TF-IDF training features
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)

# Predict the held-out split and measure accuracy
y_pred_nb = nb_model.predict(X_test_tfidf)
accuracy_nb = accuracy_score(y_test, y_pred_nb)

print(f"Naive Bayes Test Accuracy: {accuracy_nb}")

Naive Bayes Test Accuracy: 0.6936274509803921


## Random Forest

In [14]:
# Random Forest Model Training
# random_state pins the forest's bootstrap/feature sampling so the accuracy printed below
# is reproducible on a fresh kernel (42 matches the seed used for train_test_split).
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_tfidf, y_train)

# Evaluate on the held-out test split
y_pred_rf = rf_model.predict(X_test_tfidf)
accuracy_rf = accuracy_score(y_test, y_pred_rf)

print(f"Random Forest Test Accuracy: {accuracy_rf}")


Random Forest Test Accuracy: 0.7867647058823529


####  Grid Search for Random Forest Hyperparameters

In [15]:
# Candidate Random Forest settings: 3 x 3 x 3 = 27 combinations
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Seed the base estimator so candidates are compared on their hyperparameters rather than
# on run-to-run sampling noise, and so the reported best parameters are reproducible.
grid_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, cv=3)
grid_rf.fit(X_train_tfidf, y_train)
print(f"Best Random Forest parameters: {grid_rf.best_params_}")


Best Random Forest parameters: {'max_depth': 30, 'min_samples_split': 2, 'n_estimators': 200}


## Gradient Boosting

In [16]:
# Gradient Boosting Model Training
# random_state pins the estimator's internal randomness so the accuracy printed below
# is reproducible on a fresh kernel (42 matches the seed used for train_test_split).
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train_tfidf, y_train)

# Evaluate on the held-out test split
y_pred_gb = gb_model.predict(X_test_tfidf)
accuracy_gb = accuracy_score(y_test, y_pred_gb)

print(f"Gradient Boosting Test Accuracy: {accuracy_gb}")

Gradient Boosting Test Accuracy: 0.7230392156862745


## Voting Classifier

In [17]:
# Combine the SVM, Random Forest and Gradient Boosting models into a majority-vote ensemble
ensemble_members = [
    ('svm', svm_model),
    ('rf', rf_model),
    ('gb', gb_model),
]
voting_clf = VotingClassifier(estimators=ensemble_members, voting='hard')
voting_clf.fit(X_train_tfidf, y_train)

# Predict the held-out split and measure accuracy
y_pred_voting = voting_clf.predict(X_test_tfidf)
accuracy_voting = accuracy_score(y_test, y_pred_voting)

print(f"Voting Classifier Test Accuracy: {accuracy_voting}")


Voting Classifier Test Accuracy: 0.8063725490196079


# Saving and Loading the Top Two Models

In [41]:
# joblib and os are already imported in the first cell.

# Create the output directory first: joblib.dump does not create missing folders,
# so saving would fail on a fresh checkout without it.
os.makedirs('models', exist_ok=True)

# Save the Voting Classifier model
joblib.dump(voting_clf, 'models/voting_classifier_model.pkl')

# Save the SVM model
joblib.dump(svm_model, 'models/svm_model.pkl')

# Save the fitted TF-IDF vectorizer as well: the models expect features produced by
# exactly this vocabulary, so they cannot score new text without it.
joblib.dump(tfidf, 'models/tfidf_vectorizer.pkl')

# Report what was written
print("Voting Classifier model saved as 'voting_classifier_model.pkl'")
print("SVM model saved as 'svm_model.pkl'")


Voting Classifier model saved as 'voting_classifier_model.pkl'
SVM model saved as 'svm_model.pkl'


# Test Predictions

In [42]:
# Input text to test the model
test_text = ["I am feeling extremely happy today!"]

# Load the persisted models so this cell does not depend on leftover kernel state.
# NOTE: joblib.load unpickles arbitrary objects — only load files you created yourself.
svm_model_loaded = joblib.load('models/svm_model.pkl')
voting_clf_loaded = joblib.load('models/voting_classifier_model.pkl')

# Prefer a vectorizer persisted next to the models; if none is on disk, reuse the
# vectorizer fitted earlier in this notebook.
vectorizer_path = 'models/tfidf_vectorizer.pkl'
if os.path.exists(vectorizer_path):
    tfidf_vectorizer_loaded = joblib.load(vectorizer_path)
else:
    tfidf_vectorizer_loaded = tfidf

# Apply the same cleaning that was applied to the training text, then vectorize.
# Skipping this step leaves stop words in place, which changes the n-grams the
# vectorizer extracts compared with what the models were trained on.
test_text_clean = [clean_text(text) for text in test_text]
test_text_tfidf = tfidf_vectorizer_loaded.transform(test_text_clean)

# Make predictions using the loaded models
prediction_svm = svm_model_loaded.predict(test_text_tfidf)

prediction_voting = voting_clf_loaded.predict(test_text_tfidf)

# Print the predicted classes
print(f"Predicted Intensity (SVM): {prediction_svm[0]}")

print(f"Predicted Intensity (Voting Classifier): {prediction_voting[0]}")


Predicted Intensity (SVM): happiness
Predicted Intensity (Voting Classifier): happiness


### END