In [1]:
# Import necessary libraries

import re
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from textblob import TextBlob
import plotly.graph_objects as go
import plotly.express as ex
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [2]:
import nltk
# Fetch the NLTK resources this notebook needs at runtime: the stopword
# corpus (read via stopwords.words below) and the 'punkt_tab' tokenizer data
# (presumably what word_tokenize relies on — confirm against the NLTK version in use).
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [3]:
# Read data from CSV files
# Both paths are relative, so the files must sit in the kernel's working directory.
modi_data = pd.read_csv('modi_reviews.csv')
rahul_data = pd.read_csv('rahul_reviews.csv')

In [4]:
# Explore the Modi data: (row count, column count) of the raw frame
modi_data.shape

(25688, 3)

In [5]:
# NOTE(review): identical to the previous cell; possibly meant to preview rows
# (the Rahul data is inspected with .sample(10)) — confirm or remove.
modi_data.shape

(25688, 3)

In [6]:
# Explore the Rahul data: (row count, column count) of the raw frame
rahul_data.shape

(14261, 3)

In [7]:
# Preview ten randomly chosen raw rows. No random_state is passed, so the rows
# shown change from run to run.
rahul_data.sample(10)

Unnamed: 0.1,Unnamed: 0,User,Tweet
3902,3902,VoiceofSAGA,@htTweets @sardesairajdeep #BJP is not as #Mod...
6178,6178,Oneindia,#Breaking | @KirtiAzadMP joins Congress party ...
9101,9101,RafaTweets_,@boxervijender @rachitseth @INCIndia @RahulGan...
7436,7436,Honnali_BJP,2019 #LokSabha election is getting near... My ...
993,993,sujit2809,#DMBChuneKaamdar Very Important stage of these...
7669,7669,dharmik18,It's laughable how those BJP ministers and bha...
1697,1697,Scorpion1007,@CNNnews18 @RahulGandhi @INCIndia @Zakka_Jacob...
7522,7522,TriptiRTiwari,@RahulGandhi Sir you are an unadulterated Liar...
13247,13247,ajaychat555,When #RahulGandhi is launching first ever mobi...
10459,10459,TusharVTari,@sanjaynirupam @RahulGandhi Sirjee your electi...


In [8]:
# Peek at ten raw Modi tweets (rows 10-19) before any cleaning is applied.
modi_data['Tweet'][10:20]

Unnamed: 0,Tweet
10,#BJP was renamed as Safroon Party then again i...
11,All Pakistanis had serious doubts on concept o...
12,"@ajaymaken @RahulGandhi And as a final touch, ..."
13,I think before casting the vote for last phas...
14,Trying to forge opposition will be a futile ex...
15,#LokSabhaElections2019 Anyone not having mass ...
16,#ElectionCommission #LokSabhaElections2019 PM'...
17,@INCIndia should release a video of @RahulGand...
18,@KajalChauhan_ @dashingassu Our PM ( Modi ji) ...
19,Howmany Seats will BJP win ?\n\n#LokSabhaElect...


In [9]:
# Define a function for text preprocessing
def preprocess_modi_text_data(data, text_column_name):
    """Clean the text in ``data[text_column_name]`` and return a new DataFrame.

    Processing order:
      1. Print the column's dtype and its number of missing values.
      2. Drop the rows whose text is missing.
      3. Cast the remaining values to ``str`` and lowercase them.
      4. Strip every character that is not an ASCII letter, whitespace, or an
         emoticon in the range 😀-🙁 (U+1F600-U+1F641). Digits, punctuation,
         '@' and '#' are therefore removed; emoticons in that range are KEPT.
      5. Tokenize, discard English stopwords, and Porter-stem each token,
         re-joining the result with single spaces.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the raw text. It is not modified.
    text_column_name : str
        Name of the column containing the text to clean.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` without the rows whose text was missing and with
        the text column replaced by its cleaned form. Index labels of the
        surviving rows are preserved (the index is not reset).
    """
    raw_text = data[text_column_name]

    # Report on the raw column so the missing-value count reflects the input
    # itself rather than values coerced to NaN by a string operation.
    print(f"Data type of '{text_column_name}' column before handling missing values: {raw_text.dtype}")
    print(f"Number of missing values in '{text_column_name}' column before handling missing values: {raw_text.isnull().sum()}")

    # Work on an explicit copy: the caller's frame stays untouched and the
    # column assignment below cannot trigger SettingWithCopyWarning.
    cleaned = data.dropna(subset=[text_column_name]).copy()

    # Build the stopword set and the stemmer once, not once per row.
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    def _clean_text(text):
        # Lowercase first so the stopword lookup (a lowercase list) matches.
        text = text.lower()
        # NOTE(review): the emoticon range is part of the *kept* characters.
        # If emojis should be stripped too, drop the range from the class.
        text = re.sub(r'[^a-zA-Z\s😀-🙁]', '', text)
        # Digits are already gone after the substitution above, so no
        # separate number-removal pass is needed.
        # Tokenize once, then filter stopwords and stem in the same pass.
        return ' '.join(
            stemmer.stem(token)
            for token in word_tokenize(text)
            if token not in stop_words
        )

    # Cast to str before any string handling so that non-string cells are
    # cleaned like every other value instead of being turned into NaN.
    cleaned[text_column_name] = cleaned[text_column_name].astype(str).map(_clean_text)

    return cleaned

# Clean the Modi tweets, then discard the leftover 'Unnamed: 0' column.
# This cell rebinds modi_data, so re-running it without first re-running the
# CSV load fails on the drop: the column is already gone by then.
modi_data = preprocess_modi_text_data(data=modi_data, text_column_name='Tweet')
modi_data = modi_data.drop(columns=['Unnamed: 0'])

# First rows of the cleaned Modi data
modi_data.head()

Data type of 'Tweet' column before handling missing values: object
Number of missing values in 'Tweet' column before handling missing values: 5


Unnamed: 0,User,Tweet
0,advosushildixit,anjanaomkashyap see futur bjp spokesperson goo...
1,jiaeur,loksabhaelect rd may reveal even ecisveep coul...
2,PVenkatGandhi,loksabhaelect rd may reveal even ecisveep coul...
3,TheNirbhay1,pm modi creat new record pm democrat countri c...
4,ShakeChilli,abhijitmajumd appoint successor god forbid all...


In [10]:
# Peek at ten raw Rahul tweets (rows 5-14) before any cleaning is applied.
rahul_data['Tweet'][5:15]

Unnamed: 0,Tweet
5,@INCIndia should release a video of @RahulGand...
6,@thakkar_sameet @BDUTT @surjitbhalla 4/n smack...
7,"Rahul Gandhi Spotted in Switzerland, 1 hour ag..."
8,BJP will be nowhere in India after May 23. BJP...
9,#Punjab \nBhatinda- cong- Sad neck to neck \n...
10,@IYCTelangana @IncBasavakalyan @narendramodi @...
11,@ani_digital @ANI @rssurjewala IF #ECI #SunilA...
12,My Predictions\n\nBJP+NDA = 300 to 310\nCongre...
13,"Siddaramaiah, Mallikarjun Kharge, Dinesh Gundu..."
14,@RahulGandhi RG Next PM of india #LokSabhaElec...


In [11]:
# Define the text preprocessing function for the Rahul data, then apply it
def preprocess_rahul_text_data(data, text_column_name):
    """Clean the text in ``data[text_column_name]`` and return a new DataFrame.

    Processing order:
      1. Print the column's dtype and its number of missing values.
      2. Drop the rows whose text is missing.
      3. Cast the remaining values to ``str`` and lowercase them.
      4. Strip every character that is not an ASCII letter, whitespace, or an
         emoticon in the range 😀-🙁 (U+1F600-U+1F641). Digits, punctuation,
         '@' and '#' are therefore removed; emoticons in that range are KEPT.
      5. Tokenize, discard English stopwords, and Porter-stem each token,
         re-joining the result with single spaces.

    NOTE(review): this pipeline is the same as ``preprocess_modi_text_data``;
    the two functions are candidates for consolidation into one.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the raw text. It is not modified.
    text_column_name : str
        Name of the column containing the text to clean.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` without the rows whose text was missing and with
        the text column replaced by its cleaned form. Index labels of the
        surviving rows are preserved (the index is not reset).
    """
    raw_text = data[text_column_name]

    # Report on the raw column so the missing-value count reflects the input
    # itself rather than values coerced to NaN by a string operation.
    print(f"Data type of '{text_column_name}' column before handling missing values: {raw_text.dtype}")
    print(f"Number of missing values in '{text_column_name}' column before handling missing values: {raw_text.isnull().sum()}")

    # Work on an explicit copy: the caller's frame stays untouched and the
    # column assignment below cannot trigger SettingWithCopyWarning.
    cleaned = data.dropna(subset=[text_column_name]).copy()

    # Build the stopword set and the stemmer once, not once per row.
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    def _clean_text(text):
        # Lowercase first so the stopword lookup (a lowercase list) matches.
        text = text.lower()
        # NOTE(review): the emoticon range is part of the *kept* characters.
        # If emojis should be stripped too, drop the range from the class.
        text = re.sub(r'[^a-zA-Z\s😀-🙁]', '', text)
        # Digits are already gone after the substitution above, so no
        # separate number-removal pass is needed.
        # Tokenize once, then filter stopwords and stem in the same pass.
        return ' '.join(
            stemmer.stem(token)
            for token in word_tokenize(text)
            if token not in stop_words
        )

    # Cast to str before any string handling so that non-string cells are
    # cleaned like every other value instead of being turned into NaN.
    cleaned[text_column_name] = cleaned[text_column_name].astype(str).map(_clean_text)

    return cleaned

# Clean the Rahul tweets, then discard the leftover 'Unnamed: 0' column.
# This cell rebinds rahul_data, so re-running it without first re-running the
# CSV load fails on the drop: the column is already gone by then.
rahul_data = preprocess_rahul_text_data(data=rahul_data, text_column_name='Tweet')
rahul_data = rahul_data.drop(columns=['Unnamed: 0'])
rahul_data.head()

Data type of 'Tweet' column before handling missing values: object
Number of missing values in 'Tweet' column before handling missing values: 0


Unnamed: 0,User,Tweet
0,Sunnysweet16,wonder academ journalist ask inc india rahul g...
1,drnitinchaube,congrat chang australiavot scottmorrison sir w...
2,mrvivek07,peopel say govt ne year kya kiya uns pucho yea...
3,JosephPravinP,ajaymaken rahulgandhi final touch modi ji prov...
4,VandanaMegastar,loksabhaelect anyon mass back cant visionari p...


In [12]:
# Save modi_data to a CSV file
# index=False leaves the row index out of the file, so the gaps left by the
# dropped missing-text rows are not persisted.
# NOTE(review): tweets that cleaned down to an empty string are presumably
# read back as missing values by a default pd.read_csv — confirm downstream.
modi_data.to_csv('preprocessed_modi_data.csv', index=False)

# Save rahul_data to a CSV file
rahul_data.to_csv('preprocessed_rahul_data.csv', index=False)

print("Datasets saved as 'preprocessed_modi_data.csv' and 'preprocessed_rahul_data.csv'")

Datasets saved as 'preprocessed_modi_data.csv' and 'preprocessed_rahul_data.csv'
