In [4]:
import pandas as pd 
import re 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import io

In [5]:
# step1 : load the raw data 

df_modi = pd.read_csv("../data/raw/Narendra Modi_data.csv")
df_rahul = pd.read_csv("../data/raw/Rahul Gandhi_data.csv")
df_kejriwal = pd.read_csv("../data/raw/Arvind Kejriwal_data.csv")

In [9]:
print(df_modi.head())
print(df_kejriwal.head())
print(df_rahul.head())

         Date             User  \
0  2022:10:19   QuestionsBotYT   
1  2022:10:19       PaperDabba   
2  2022:10:19   mnjworldcom123   
3  2022:10:19  BravePedestrian   
4  2022:10:19   NaMoPraveenKor   

                                               Tweet      Time  
0                        Is Narendra Modi a toaster?  23:57:08  
1  5G About To Bring Major Change, Will Revolutio...  23:56:38  
2  Prime Minister Shri Narendra Modi along with H...  23:51:02  
3  Bharat Mata has waited 5000 years for a true s...  23:40:58  
4  How Narendra Modi’s game-changing Gati Shakti ...  23:34:25  
         Date             User  \
0  2022:10:19        bhoo_sene   
1  2022:10:19  Madhusu88858324   
2  2022:10:19  PremshilaKumarp   
3  2022:10:19        lifebecom   
4  2022:10:19        sphavisha   

                                               Tweet      Time  
0  @TajinderBagga Aap leaders are speaking agains...  23:47:01  
1  Bjp Aap se sikh rhi h\nNarendra Modi Manish Si...  23:06:35  
2    

In [10]:
# combine dataframes
df = pd.concat([df_modi, df_rahul, df_kejriwal], ignore_index=True)
print("Combined Data Loaded. Total rows:", len(df))

Combined Data Loaded. Total rows: 210000


In [11]:
# STEP -2 initial preprocessing (cleaning the text)

def clean_tweet(text):
    text= str(text) # to make sure text is string 
    text=re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE) # Remove URLs
    text = re.sub(r'@\w+', '', text) # Remove mentions (@anurag)
    text = re.sub(r'RT[\s]+', '', text) # Remove Retweet tags
    text = re.sub(r'[^\w\s#]', '', text) # Remove special characters/emojis
    text = text.lower() # Convert to lowercase
    return text

In [12]:
df['Clean_Tweet'] = df['Tweet'].apply(clean_tweet)

In [13]:
df.dropna(subset=['Clean_Tweet'] , inplace=True) # Important : Drop not a number values
print("Tweets Cleaned")

Tweets Cleaned


## 

## STEP 3: labeling the Data(Heuristic/Rule-Based)

In [16]:
positive_keywords = [
    'hero', 'great', 'proud', 'massive support', 'game-changing', 
    'achievement', 'optimism', 'exciting', 'interesting',
    'awesome', 'amazing', 'fantastic', 'wonderful', 'brilliant', 'excellent',
    'impressive', 'outstanding', 'incredible', 'beautiful', 'love',
    'respect', 'support', 'well done', 'congrats', 'congratulations',
    'victory', 'win', 'successful', 'superb', 'legend', 'epic', 'cool',
    'positive', 'grateful', 'thankful', 'encouraging', 'celebrate', 
    'inspiring', 'inspiration', 'remarkable', 'top notch', 'thrilled',
    'great job', 'kudos', 'heartwarming', 'uplifting', 'blessed',
    'good vibes', 'happy', 'joy', 'delight', 'cheerful', 'excellent work',
    'well deserved', 'improvement', 'progress', 'milestone',
    'mast', 'badiya', 'zabardast', 'solid', 'kamaal', 'dil jeet' 'liya', 'shandar',
'respect mil gaya', 'pyaar', 'dhamaal', 'sahi hai',' proud moment'

]

negative_keywords = [
    'dead horse', 'blunt sword', 'fascism', 'venom', 'against', 'hiding',
    'sikh rhi h', 'sikh rhe h', 'fools', 'toaster',
    'hate', 'terrible', 'awful', 'worst', 'bad', 'disaster', 'fail', 'failure',
    'useless', 'nonsense', 'trash', 'pathetic', 'angry', 'annoying',
    'ridiculous', 'joke', 'mess', 'broken', 'problem', 'issue', 'disappointing',
    'weak', 'corrupt', 'shame', 'shameful', 'stupid', 'idiot', 'nasty',
    'negative', 'toxic', 'boring', 'lame', 'sucks', 'cringe', 'misery',
    'frustrating', 'horrible', 'tragic', 'dangerous', 'pain', 'suffering',
    'warning', 'complaint', 'attack', 'blame', 'exposed', 'failure', 'fraud',
    'lies', 'liar', 'broken', 'problematic','bakwas', 'bekaar', 'chutiya', 'ghatiya', 'faltu', 'chutiyapa', 'pagal', 'bewakoof',
'fraud', 'dhokha', 'khooti baat,'  , 'lafda', 'pareshan', 'beizzati', 'gussa'

]


In [17]:
def simple_labeler(text):
    text_lower = text.lower()
    
    if any(keyword in text_lower for keyword in negative_keywords):
        return -1 # means negative
    
    if any(keyword in text_lower for keyword in positive_keywords):
        return 1 # means positive
    
    return 0 # means neutral

df['Sentiment_Label'] = df['Clean_Tweet'].apply(simple_labeler)

print("\nSentiment Label Distribution (Target Variable):")
print(df['Sentiment_Label'].value_counts())


Sentiment Label Distribution (Target Variable):
Sentiment_Label
 0    147942
 1     33782
-1     28276
Name: count, dtype: int64


## STEP 4: Feature Engineering (TF-IDF Vectorization)

In [19]:
# separate features(x) and target(y)

X = df['Clean_Tweet']
Y = df['Sentiment_Label']

# split data into training and testing sets(80% train and 20% test )
# stratify=y ensures the train/test split has the same ratio of sentiment labels

X_train , X_test , Y_train , Y_test = train_test_split(X, Y , test_size=0.2, random_state=42, stratify=Y)
# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.8, min_df=1)

# Fit/Train the vectorizer on the training data and transform both sets
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print(f"\nTraining data shape after TF-IDF: {X_train_tfidf.shape}")
print(f"Testing data shape after TF-IDF: {X_test_tfidf.shape}")



Training data shape after TF-IDF: (168000, 128439)
Testing data shape after TF-IDF: (42000, 128439)


In [26]:
# model training and eveluation
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import confusion_matrix , classification_report ,accuracy_score 
import numpy as np 

logreg_model = LogisticRegression(
    solver='liblinear', 
    random_state=42, 
    multi_class='ovr', 
    max_iter=1000 # Increased max_iter for convergence
)

print("\n Starting model training...")
logreg_model.fit(X_train_tfidf , Y_train )
print("Model training complete.")

# make predictions on test-set
Y_pred = logreg_model.predict(X_test_tfidf)

#model evalution
print("\n model evaluation results")

overall_accuracy = accuracy_score(Y_test , Y_pred)
print(f"Overall Accuracy: {overall_accuracy:.4f}")

 #  Classification Report (Detailed performance per class)

print("\n Classification Report (Precision, Recall, F1-Score) ")
print(classification_report(Y_test, Y_pred, target_names=['Negative (-1)', 'Neutral (0)', 'Positive (1)']))


#  Confusion Matrix
# it will show -->  how many instances were correctly and incorrectly classified for each class
conf_matrix = confusion_matrix(Y_test, Y_pred)

print("\n Confusion Matrix (Actual vs. Predicted) ")

# Labels: Actual on rows, Predicted on columns
# [[TN, FP_to_0, FP_to_1]
#  [FN_to_-1, TN_0, FP_to_1]
#  [FN_to_-1, FN_to_0, TP]]
print(" \n Predicted: -1    0    1 \n ")
print(f"Actual -1: {conf_matrix[0]}")
print(f"Actual 0:  {conf_matrix[1]}")
print(f"Actual 1:  {conf_matrix[2]}")




 Starting model training...




Model training complete.

 model evaluation results
Overall Accuracy: 0.9407

 Classification Report (Precision, Recall, F1-Score) 
               precision    recall  f1-score   support

Negative (-1)       0.99      0.72      0.83      5655
  Neutral (0)       0.93      1.00      0.96     29589
 Positive (1)       0.97      0.87      0.92      6756

     accuracy                           0.94     42000
    macro avg       0.96      0.86      0.90     42000
 weighted avg       0.94      0.94      0.94     42000


 Confusion Matrix (Actual vs. Predicted) 
 
 Predicted: -1    0    1 
 
Actual -1: [4077 1384  194]
Actual 0:  [   24 29564     1]
Actual 1:  [  10  879 5867]
