In [73]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [74]:
df = pd.read_csv("/content/Sentiment dataset.csv")
print(df.head())

   Unnamed: 0.1  Unnamed: 0  \
0             0           0   
1             1           1   
2             2           2   
3             3           3   
4             4           4   

                                                Text    Sentiment  \
0   Enjoying a beautiful day at the park!        ...   Positive     
1   Traffic was terrible this morning.           ...   Negative     
2   Just finished an amazing workout! 💪          ...   Positive     
3   Excited about the upcoming weekend getaway!  ...   Positive     
4   Trying out a new recipe for dinner tonight.  ...   Neutral      

             Timestamp            User     Platform  \
0  2023-01-15 12:30:00   User123          Twitter     
1  2023-01-15 08:45:00   CommuterX        Twitter     
2  2023-01-15 15:45:00   FitnessFan      Instagram    
3  2023-01-15 18:20:00   AdventureX       Facebook    
4  2023-01-15 19:55:00   ChefCook        Instagram    

                                     Hashtags  Retweets  Likes     

In [75]:
df.shape

(732, 15)

In [76]:
df.describe()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Retweets,Likes,Year,Month,Day,Hour
count,732.0,732.0,732.0,732.0,732.0,732.0,732.0,732.0
mean,366.464481,369.740437,21.508197,42.901639,2020.471311,6.122951,15.497268,15.521858
std,211.513936,212.428936,7.061286,14.089848,2.802285,3.411763,8.474553,4.113414
min,0.0,0.0,5.0,10.0,2010.0,1.0,1.0,0.0
25%,183.75,185.75,17.75,34.75,2019.0,3.0,9.0,13.0
50%,366.5,370.5,22.0,43.0,2021.0,6.0,15.0,16.0
75%,549.25,553.25,25.0,50.0,2023.0,9.0,22.0,19.0
max,732.0,736.0,40.0,80.0,2023.0,12.0,31.0,23.0


In [77]:
df.drop(columns='Unnamed: 0.1',inplace=True)

In [78]:
df.rename(columns={'Unnamed: 0':'Id'},inplace=True)

In [79]:
df.dtypes

Unnamed: 0,0
Id,int64
Text,object
Sentiment,object
Timestamp,object
User,object
Platform,object
Hashtags,object
Retweets,float64
Likes,float64
Country,object


In [80]:
df['Platform'].value_counts()

Unnamed: 0_level_0,count
Platform,Unnamed: 1_level_1
Instagram,258
Facebook,231
Twitter,128
Twitter,115


In [81]:
df['Sentiment'] = df['Sentiment'].str.strip()
df_binary = df[df['Sentiment'].isin(['Positive', 'Negative'])].copy()

X = df_binary['Text']
y = df_binary['Sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [82]:
#fit TF-IDF Vectorizer and train svm
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_transformed = tfidf_vectorizer.fit_transform(X_train)
X_test_transformed = tfidf_vectorizer.transform(X_test)

svm_linear = SVC(kernel='linear', random_state=42)
svm_linear.fit(X_train_transformed, y_train)


svm_rbf = SVC(kernel='rbf', random_state=42)
svm_rbf.fit(X_train_transformed, y_train)

In [83]:
# Predict and evaluate Linear SVM
y_pred_linear = svm_linear.predict(X_test_transformed)
accuracy_linear = accuracy_score(y_test, y_pred_linear)
precision_linear = precision_score(y_test, y_pred_linear, pos_label='Positive')
recall_linear = recall_score(y_test, y_pred_linear, pos_label='Positive')

# Note: For AUC, we need probability estimates. Re-train SVC with probability=True if not already done.
# Assuming svm_linear was already trained without probability=True, re-initialize and train.
svm_linear_prob = SVC(kernel='linear', random_state=42, probability=True)
svm_linear_prob.fit(X_train_transformed, y_train)
y_prob_linear = svm_linear_prob.predict_proba(X_test_transformed)[:, 1] # Probability of the positive class
auc_linear = roc_auc_score(y_test, y_prob_linear)

#RBF SVM
y_pred_rbf = svm_rbf.predict(X_test_transformed)
accuracy_rbf = accuracy_score(y_test, y_pred_rbf)
precision_rbf = precision_score(y_test, y_pred_rbf, pos_label='Positive')
recall_rbf = recall_score(y_test, y_pred_rbf, pos_label='Positive')

svm_rbf_prob = SVC(kernel='rbf', random_state=42, probability=True)
svm_rbf_prob.fit(X_train_transformed, y_train)
y_prob_rbf = svm_rbf_prob.predict_proba(X_test_transformed)[:, 1]
auc_rbf = roc_auc_score(y_test, y_prob_rbf)

metrics = {
    'Linear SVM': {
        'Accuracy': accuracy_linear,
        'Precision': precision_linear,
        'Recall': recall_linear,
        'AUC': auc_linear
    },
    'RBF SVM': {
        'Accuracy': accuracy_rbf,
        'Precision': precision_rbf,
        'Recall': recall_rbf,
        'AUC': auc_rbf
    }
}

display(pd.DataFrame(metrics))

Unnamed: 0,Linear SVM,RBF SVM
Accuracy,0.9,0.9
Precision,0.9,0.9
Recall,1.0,1.0
AUC,0.0,0.0


In [84]:
metrics_df = pd.DataFrame(metrics)
display(metrics_df)

Unnamed: 0,Linear SVM,RBF SVM
Accuracy,0.9,0.9
Precision,0.9,0.9
Recall,1.0,1.0
AUC,0.0,0.0
