In [17]:
import pandas as pd 
import re 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import io

In [18]:
# step1 : load the raw data 

df_modi = pd.read_csv("../data/raw/Narendra Modi_data.csv")
df_rahul = pd.read_csv("../data/raw/Rahul Gandhi_data.csv")
df_kejriwal = pd.read_csv("../data/raw/Arvind Kejriwal_data.csv")

In [19]:
print(modi_df.head())
print(rahul_df.head())
print(kejri_df.head())

         Date             User  \
0  2022:10:19   QuestionsBotYT   
1  2022:10:19       PaperDabba   
2  2022:10:19   mnjworldcom123   
3  2022:10:19  BravePedestrian   
4  2022:10:19   NaMoPraveenKor   

                                               Tweet      Time  
0                        Is Narendra Modi a toaster?  23:57:08  
1  5G About To Bring Major Change, Will Revolutio...  23:56:38  
2  Prime Minister Shri Narendra Modi along with H...  23:51:02  
3  Bharat Mata has waited 5000 years for a true s...  23:40:58  
4  How Narendra Modi’s game-changing Gati Shakti ...  23:34:25  
         Date             User  \
0  2022:10:19          MdIjran   
1  2022:10:19  28bde43dae3c430   
2  2022:10:19         SkAnzar5   
3  2022:10:19    HariRamDamor2   
4  2022:10:19     srinivas_das   

                                               Tweet      Time  
0  @JaikyYadav16 इन विकल्पों में से और अभी के समय...  23:55:49  
1  @ndtv Rahul Gandhi left congress in the mid ro...  23:53:30  
2    

In [20]:
# combine dataframes
df = pd.concat([df_modi, df_rahul, df_kejriwal], ignore_index=True)
print("Combined Data Loaded. Total rows:", len(df))

Combined Data Loaded. Total rows: 210000


In [21]:
# STEP -2 initial preprocessing (cleaning the text)

def clean_tweet(text):
    text= str(text) # to make sure text is string 
    text=re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE) # Remove URLs
    text = re.sub(r'@\w+', '', text) # Remove mentions (@anurag)
    text = re.sub(r'RT[\s]+', '', text) # Remove Retweet tags
    text = re.sub(r'[^\w\s#]', '', text) # Remove special characters/emojis
    text = text.lower() # Convert to lowercase
    return text

In [22]:
df['Clean_Tweet'] = df['Tweet'].apply(clean_tweet)

In [23]:
df.dropna(subset=['Clean_Tweet'] , inplace=True) # Important : Drop not a number values
print("Tweets Cleaned")

Tweets Cleaned


## 

## STEP 3: labeling the Data(Heuristic/Rule-Based)

In [24]:
positive_keywords = ['hero', 'great', 'proud', 'massive support', 'game-changing', 'achievement', 'optimism', 'exciting', 'interesting']
negative_keywords = ['dead horse', 'blunt sword', 'fascism', 'venom', 'against', 'hiding', 'sikh rhi h', 'sikh rhe h', 'fools', 'toaster']

In [25]:
positive_keywords = ['hero', 'great', 'proud', 'massive support', 'game-changing', 'achievement', 'optimism', 'exciting', 'interesting']
negative_keywords = ['dead horse', 'blunt sword', 'fascism', 'venom', 'against', 'hiding', 'sikh rhi h', 'sikh rhe h', 'fools', 'toaster']

def simple_labeler(text):
    text_lower = text.lower()
    
    if any(keyword in text_lower for keyword in negative_keywords):
        return -1 # means negative
    
    if any(keyword in text_lower for keyword in positive_keywords):
        return 1 # means positive
    
    return 0 # means neutral

df['Sentiment_Label'] = df['Clean_Tweet'].apply(simple_labeler)

print("\nSentiment Label Distribution (Target Variable):")
print(df['Sentiment_Label'].value_counts())


Sentiment Label Distribution (Target Variable):
Sentiment_Label
 0    199231
 1      6681
-1      4088
Name: count, dtype: int64


## STEP 4: Feature Engineering (TF-IDF Vectorization)

In [28]:
# separate features(x) and target(y)

X = df['Clean_Tweet']
Y = df['Sentiment_Label']

# split data into training and testing sets(80% train and 20% test )
# stratify=y ensures the train/test split has the same ratio of sentiment labels

X_train , X_test , Y_train , Y_test = train_test_split(X, Y , test_size=0.2, random_state=42, stratify=Y)
# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.8, min_df=1)

# Fit/Train the vectorizer on the training data and transform both sets
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print(f"\nTraining data shape after TF-IDF: {X_train_tfidf.shape}")
print(f"Testing data shape after TF-IDF: {X_test_tfidf.shape}")



Training data shape after TF-IDF: (168000, 128334)
Testing data shape after TF-IDF: (42000, 128334)
