In [2]:
import pandas as pd

# 1. Load the dataset
# (Replace 'IMDB Dataset.csv' with your actual filename!)
df = pd.read_csv('Sentiment dataset.csv')

# 2. Check the first few rows
print("--- First 5 Reviews ---")
display(df.head())

# 3. Check dataset size
print(f"\nTotal Reviews: {len(df)}")

# 4. Check for missing values (Text data often has empty rows)
print("\n--- Missing Values ---")
print(df.isnull().sum())

--- First 5 Reviews ---


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Text,Sentiment,Timestamp,User,Platform,Hashtags,Retweets,Likes,Country,Year,Month,Day,Hour
0,0,0,Enjoying a beautiful day at the park! ...,Positive,2023-01-15 12:30:00,User123,Twitter,#Nature #Park,15.0,30.0,USA,2023,1,15,12
1,1,1,Traffic was terrible this morning. ...,Negative,2023-01-15 08:45:00,CommuterX,Twitter,#Traffic #Morning,5.0,10.0,Canada,2023,1,15,8
2,2,2,Just finished an amazing workout! ðŸ’ª ...,Positive,2023-01-15 15:45:00,FitnessFan,Instagram,#Fitness #Workout,20.0,40.0,USA,2023,1,15,15
3,3,3,Excited about the upcoming weekend getaway! ...,Positive,2023-01-15 18:20:00,AdventureX,Facebook,#Travel #Adventure,8.0,15.0,UK,2023,1,15,18
4,4,4,Trying out a new recipe for dinner tonight. ...,Neutral,2023-01-15 19:55:00,ChefCook,Instagram,#Cooking #Food,12.0,25.0,Australia,2023,1,15,19



Total Reviews: 732

--- Missing Values ---
Unnamed: 0.1    0
Unnamed: 0      0
Text            0
Sentiment       0
Timestamp       0
User            0
Platform        0
Hashtags        0
Retweets        0
Likes           0
Country         0
Year            0
Month           0
Day             0
Hour            0
dtype: int64


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

# 1. Load the Dataset
df = pd.read_csv('Sentiment dataset.csv')

# 2. Select only the necessary columns
# We only need the input text and the target label
df_clean = df[['Text', 'Sentiment']].copy()

# 3. Text Vectorization (The Magic Step)
# This converts text into a matrix of numbers
# stop_words='english' removes common words like 'the', 'is', 'and' that don't carry emotion
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)

# We fit on the raw text
X = tfidf.fit_transform(df_clean['Text'])
y = df_clean['Sentiment']

# 4. Split Data (80% Train, 20% Test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Train the SVM Model
# LinearSVC is a version of SVM optimized for text data
svm_model = LinearSVC(random_state=42)
svm_model.fit(X_train, y_train)

# 6. Evaluate
predictions = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)

print(f"Model Accuracy: {accuracy * 100:.2f}%")
print("\n--- Classification Report ---")
print(classification_report(y_test, predictions))

# 7. Test it with your own sentence!
my_review = ["I absolutely loved this product, it was fantastic!"]
my_review_vec = tfidf.transform(my_review)
print(f"\nPrediction for custom review: {svm_model.predict(my_review_vec)[0]}")

Model Accuracy: 31.29%

--- Classification Report ---
                        precision    recall  f1-score   support

         Acceptance          0.00      0.00      0.00         2
      Acceptance             0.00      0.00      0.00         0
       Accomplishment        0.00      0.00      0.00         0
           Admiration        0.00      0.00      0.00         1
        Admiration           0.00      0.00      0.00         1
            Adventure        0.00      0.00      0.00         0
         Affection           1.00      1.00      1.00         1
      Ambivalence            1.00      1.00      1.00         1
         Anger               0.00      0.00      0.00         1
        Anticipation         0.00      0.00      0.00         1
            Anxiety          0.00      0.00      0.00         0
        Arousal              0.67      0.67      0.67         3
        ArtisticBurst        0.00      0.00      0.00         0
                  Awe        0.00      0.00      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [4]:
# Check how many unique emotions are in the target column
print("Number of unique emotions:", df['Sentiment'].nunique())

# List the top 10 most common emotions
print("\n--- Top 10 Emotions ---")
print(df['Sentiment'].value_counts().head(10))

Number of unique emotions: 279

--- Top 10 Emotions ---
Sentiment
Positive        44
Joy             42
Excitement      32
Happy           14
Neutral         14
Contentment     14
Gratitude        9
Hopeful          9
Sad              9
Curiosity        8
Name: count, dtype: int64
