In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [5]:
df = pd.read_csv('/content/Preprocessed_data.csv')
df.head()

Unnamed: 0,id,original_text,text,listed_emotions,emotion_count,labels,encoded_labels
0,eew5j0j,That game hurt.,game hurt,sadness,1,negative,0
1,ed2mah1,"You do right, if you don't care then fuck 'em!",right care fuck em,neutral,1,neutral,1
2,eeibobj,Man I love reddit.,man love reddit,love,1,positive,2
3,eda6yn6,"[NAME] was nowhere near them, he was by the Fa...",nowhere near falcon,neutral,1,neutral,1
4,eespn2i,Right? Considering it’s such an important docu...,right considering important document know damn...,gratitude,1,positive,2


In [6]:
df.isna().sum()

id                   0
original_text        0
text               787
listed_emotions      0
emotion_count        0
labels               0
encoded_labels       0
dtype: int64

It appears that the original dataframe had texts that were entirely made of stopwords, so after cleaning, the rows with such texts were left with missing values, therefore we proceed with dropping the missing values.

In [7]:
df.dropna(inplace = True)

In [8]:
final_df = df[['text', 'encoded_labels']]
final_df.head()

Unnamed: 0,text,encoded_labels
0,game hurt,0
1,right care fuck em,1
2,man love reddit,2
3,nowhere near falcon,1
4,right considering important document know damn...,2


In [9]:
import pandas as pd

# Load AFINN lexicon into a Python dictionary
def load_afinn_lexicon(file_path):
    afinn = {}
    with open(file_path, 'r') as file:
        for line in file:
            term, score = line.strip().split('\t')
            afinn[term] = int(score)
    return afinn

# Function to calculate the sentiment score of a sentence using the AFINN lexicon
def calculate_sentiment_score(sentence, afinn_lexicon):
    words = sentence.split()
    sentiment_score = sum(afinn_lexicon.get(word, 0) for word in words)
    return sentiment_score

# Load the AFINN lexicon
afinn_file_path = '/content/AFINN-en-165.txt'
afinn_lexicon = load_afinn_lexicon(afinn_file_path)

# Assuming 'final_df' is your DataFrame with a 'text' column
# Create a new column 'sentiment_score' to store the sentiment score for each text
final_df['sentiment_score'] = final_df['text'].apply(lambda x: calculate_sentiment_score(x, afinn_lexicon))

# Create a new column 'sentiment_label' based on the sentiment score
final_df.loc[:, 'sentiment_label'] = final_df['sentiment_score'].apply(lambda x: 'positive' if x > 0 else 'negative' if x < 0 else 'neutral')

# Display the DataFrame with the newly added features
print(final_df)




                                                     text  encoded_labels  \
0                                               game hurt               0   
1                                      right care fuck em               1   
2                                         man love reddit               2   
3                                     nowhere near falcon               1   
4       right considering important document know damn...               2   
...                                                   ...             ...   
207809  well glad awful way act make think healthy bou...               2   
207810                                      everyone like               2   
207811  well youve imported gazillion country get serious               2   
207812                                       look amazing               2   
207813  fda plenty criticize like usually criticized h...               0   

        sentiment_score sentiment_label  
0                    -2        ne

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['sentiment_score'] = final_df['text'].apply(lambda x: calculate_sentiment_score(x, afinn_lexicon))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.loc[:, 'sentiment_label'] = final_df['sentiment_score'].apply(lambda x: 'positive' if x > 0 else 'negative' if x < 0 else 'neutral')


In [10]:
# Using SGDClassifier
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier  # Import SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Preprocess the text (if needed) and split the data into training and testing sets
X_text = final_df['text']
X_sentiment = final_df['sentiment_score']  # Use the sentiment scores as additional features
y = final_df['sentiment_label']

# Split the data into 70% training and 30% testing
X_text_train, X_text_test, X_sentiment_train, X_sentiment_test, y_train, y_test = train_test_split(
    X_text, X_sentiment, y, test_size=0.3, random_state=42
)

# Vectorize the text data using TF-IDF (Term Frequency-Inverse Document Frequency) vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # You can adjust the number of features as needed
X_text_train_tfidf = tfidf_vectorizer.fit_transform(X_text_train)
X_text_test_tfidf = tfidf_vectorizer.transform(X_text_test)

# Combine TF-IDF features with sentiment scores as additional features
import scipy.sparse as sp
X_train = sp.hstack([X_text_train_tfidf, X_sentiment_train.values.reshape(-1, 1)], format='csr')
X_test = sp.hstack([X_text_test_tfidf, X_sentiment_test.values.reshape(-1, 1)], format='csr')

# Train an SGD classifier (instead of SVM)
sgd_classifier = SGDClassifier(loss='hinge', random_state=42)  # Use hinge loss for linear SVM
sgd_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = sgd_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")

classification_report_result = classification_report(y_test, y_pred)
print("Classification Report:\n", classification_report_result)


Accuracy: 0.97
Precision: 0.97
Recall: 0.97
F1-Score: 0.97
Classification Report:
               precision    recall  f1-score   support

    negative       0.90      1.00      0.95     18619
     neutral       1.00      0.88      0.93     16035
    positive       1.00      1.00      1.00     27455

    accuracy                           0.97     62109
   macro avg       0.97      0.96      0.96     62109
weighted avg       0.97      0.97      0.97     62109



In [11]:
final_df['sentiment_label'].value_counts()

positive    91563
negative    61863
neutral     53601
Name: sentiment_label, dtype: int64

In [12]:
# Addressing class imbalance using SMOTE
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE

# Preprocess the text (if needed) and split the data into training and testing sets
X_text = final_df['text']
X_sentiment = final_df['sentiment_score']  # Use the sentiment scores as additional features
y = final_df['sentiment_label']

# Split the data into 70% training and 30% testing
X_text_train, X_text_test, X_sentiment_train, X_sentiment_test, y_train, y_test = train_test_split(
    X_text, X_sentiment, y, test_size=0.3, random_state=42
)

# Vectorize the text data using TF-IDF (Term Frequency-Inverse Document Frequency) vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # You can adjust the number of features as needed
X_text_train_tfidf = tfidf_vectorizer.fit_transform(X_text_train)
X_text_test_tfidf = tfidf_vectorizer.transform(X_text_test)

# Combine TF-IDF features with sentiment scores as additional features
import scipy.sparse as sp
X_train = sp.hstack([X_text_train_tfidf, X_sentiment_train.values.reshape(-1, 1)], format='csr')
X_test = sp.hstack([X_text_test_tfidf, X_sentiment_test.values.reshape(-1, 1)], format='csr')

# Apply SMOTE to handle class imbalances
sm = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = sm.fit_resample(X_train, y_train)

# Train an SGD classifier (instead of SVM) on the resampled data
sgd_classifier = SGDClassifier(loss='hinge', random_state=42)  # Use hinge loss for linear SVM
sgd_classifier.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test set
y_pred = sgd_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")

classification_report_result = classification_report(y_test, y_pred)
print("Classification Report:\n", classification_report_result)


Accuracy: 0.97
Precision: 0.97
Recall: 0.97
F1-Score: 0.97
Classification Report:
               precision    recall  f1-score   support

    negative       0.91      1.00      0.95     18619
     neutral       0.98      0.88      0.93     16035
    positive       1.00      0.99      1.00     27455

    accuracy                           0.97     62109
   macro avg       0.96      0.96      0.96     62109
weighted avg       0.97      0.97      0.97     62109



In [14]:
import joblib
# Save the trained model as a pickle file
model_filename = 'sentiment_classifier_model.pkl'
joblib.dump(sgd_classifier, model_filename)

print("Model saved as", model_filename)

Model saved as sentiment_classifier_model.pkl
