In [None]:
# Colab-only step: prompt for a file upload. The recorded run reports
# "Saving Twitter_Data.csv to Twitter_Data.csv", i.e. the file ends up in the
# working directory and is later read by its bare file name.
# NOTE(review): `uploaded` is not referenced again in the visible cell.
from google.colab import files
uploaded = files.upload()


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords


# 4. Load the Dataset
# Keep the file exactly as read in `raw_df` so the diagnostics below describe
# the data on disk, not the normalised frame used for modelling.
raw_df = pd.read_csv('Twitter_Data.csv')

print("📌 First 5 rows:")
display(raw_df.head())

# 5. Show Actual Column Names
print("📋 Columns in your dataset:")
print(raw_df.columns.tolist())

# 6. Check Missing Values
print("🔍 Missing Values:")
print(raw_df.isnull().sum())

# 6b. Normalise the Schema
# This CSV names its columns 'clean_text' / 'category', while the rest of the
# notebook reads 'Tweet' / 'Sentiment'. Renaming here (a no-op when the file
# already uses the expected names) avoids the KeyError on df['Sentiment'].
df = raw_df.rename(columns={'clean_text': 'Tweet', 'category': 'Sentiment'})

missing_cols = [col for col in ('Tweet', 'Sentiment') if col not in df.columns]
if missing_cols:
    raise KeyError(
        f"Expected column(s) {missing_cols} not found; "
        f"available columns: {raw_df.columns.tolist()}"
    )

# Rows without text or without a label cannot be used for supervised training
# (NaN labels make the classifier's fit() fail), so drop them up front.
df = df.dropna(subset=['Tweet', 'Sentiment']).reset_index(drop=True)

# Labels are parsed as floats (-1.0 / 0.0 / 1.0) because the column contained
# NaN; once those rows are gone, cast integral floats back to int so reports
# and plot ticks read -1 / 0 / 1.
if pd.api.types.is_float_dtype(df['Sentiment']) and (df['Sentiment'] % 1 == 0).all():
    df['Sentiment'] = df['Sentiment'].astype(int)

# 7. Basic Distribution
print("\n📊 Sentiment Distribution:")
print(df['Sentiment'].value_counts())
sns.countplot(data=df, x='Sentiment')
plt.title("Sentiment Distribution")
plt.show()

# 8. Text Preprocessing Function
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def clean_text(text):
    """Normalise one tweet into a lowercase, space-separated string of words.

    Steps: lowercase, strip links, @mentions and #hashtags, drop every
    character that is not an ASCII letter or whitespace, then remove English
    stop words.

    Parameters
    ----------
    text : object
        The raw tweet. Non-string values are converted with ``str()``;
        ``None`` and float NaN (pandas' missing-value marker) are treated as
        empty text instead of becoming the literal tokens "none" / "nan".

    Returns
    -------
    str
        The cleaned text; may be empty.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    # The dot after "www" is escaped so ordinary words that merely start with
    # "www" are not mistaken for links.
    text = re.sub(r'http\S+|www\.\S+', '', text)  # remove links
    text = re.sub(r'@\w+', '', text)  # remove mentions
    text = re.sub(r'#\w+', '', text)  # remove hashtags
    # Keeps only ASCII letters and whitespace; this already removes all
    # punctuation and digits, so no separate punctuation pass is needed.
    text = re.sub(r'[^A-Za-z\s]', '', text)

    # Look the stop-word set up once per call (cached across calls) rather
    # than re-reading the corpus for every single word.
    stop_words = _english_stopwords()
    return ' '.join(word for word in text.split() if word not in stop_words)

# 9. Apply Cleaning to the 'Tweet' Column
df['clean_text'] = df['Tweet'].apply(clean_text)
y = df['Sentiment']

# 10. Train-Test Split
# Split the cleaned *text* before vectorising so the TF-IDF vocabulary and IDF
# weights are learned from the training portion only. Fitting the vectorizer
# on the full corpus would leak test-set statistics into training.
# With the same random_state the selected rows match a split of the matrix.
text_train, text_test, y_train, y_test = train_test_split(
    df['clean_text'], y, test_size=0.2, random_state=42
)

# 11. TF-IDF Vectorization (fit on train, apply to test)
vectorizer = TfidfVectorizer(max_features=3000)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)
# Full feature matrix under the train-fitted vocabulary, kept so the name `X`
# is still available to any later code.
X = vectorizer.transform(df['clean_text'])

# 12. Model Training - Naive Bayes
model = MultinomialNB()
model.fit(X_train, y_train)

# 13. Predictions and Evaluation
y_pred = model.predict(X_test)
print("\n📈 Classification Report:\n", classification_report(y_test, y_pred))
print("✅ Accuracy:", accuracy_score(y_test, y_pred))

# 14. Confusion Matrix
# Pass labels explicitly so the matrix rows/columns are guaranteed to be in
# the same order (and of the same length) as the model.classes_ tick labels,
# even if some class never appears in the test fold.
cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
sns.heatmap(cm, annot=True, fmt='d', xticklabels=model.classes_, yticklabels=model.classes_, cmap='Blues')
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()


Saving Twitter_Data.csv to Twitter_Data.csv


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


📌 First 5 rows:


Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


📋 Columns in your dataset:
['clean_text', 'category']
🔍 Missing Values:
clean_text    4
category      7
dtype: int64

📊 Sentiment Distribution:


KeyError: 'Sentiment'