In [4]:
# Import necessary libraries

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [5]:
# Load the preprocessed tweet datasets, one CSV file per candidate
modi_data, rahul_data = (
    pd.read_csv(csv_name)
    for csv_name in ('preprocessed_modi_data.csv', 'preprocessed_rahul_data.csv')
)

In [6]:
# Show the (rows, columns) shape of each candidate's dataset, Modi first
for candidate_frame in (modi_data, rahul_data):
    print(candidate_frame.shape)

(25683, 2)
(14261, 2)


# Predicting the Election Outcome with Logistic Regression

In [7]:
# Combine data for both candidates. The label records which candidate's
# dataset a tweet came from; it is not a sentiment score.
modi_data['Label'] = 1  # Modi as class 1
rahul_data['Label'] = 0  # Rahul as class 0
# ignore_index=True gives every combined row a unique index label; without it
# each frame keeps its own 0..n-1 index and the result contains duplicates.
combined_data = pd.concat([modi_data, rahul_data], ignore_index=True)

# Feature extraction using CountVectorizer: bag-of-words counts restricted to
# the 1000 most frequent terms. The result is kept as a SciPy sparse matrix,
# which train_test_split and LogisticRegression accept directly; converting it
# with .toarray() would allocate a dense rows x 1000 array for no benefit.
# NOTE(review): the vocabulary is fitted on all tweets before the split, so
# test rows influence which terms are kept. Fit on the training split only
# (e.g. via a Pipeline) if a strict hold-out estimate is needed.
vectorizer = CountVectorizer(max_features=1000)
X = vectorizer.fit_transform(combined_data['Tweet'])
y = combined_data['Label']

# Split data into training and test sets (80/20, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions and calculate accuracy on the held-out split
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Prediction accuracy with Logistic Regression: {accuracy * 100:.2f}%")

# Compare how often the classifier assigns class 1 (Modi) to each candidate's
# tweets. X already contains the vectorized tweets of both candidates, so
# predict once and split the result by true label instead of re-vectorizing
# and re-predicting each frame separately.
# NOTE(review): these means are the share of tweets classified as Modi, not a
# sentiment measure, and they include rows the model was trained on. Any
# classifier that beats chance yields modi_prediction > rahul_prediction, so
# this comparison cannot by itself support a claim about the election result.
predicted_labels = model.predict(X)
is_modi = (y == 1).to_numpy()
modi_prediction = predicted_labels[is_modi].mean()
rahul_prediction = predicted_labels[~is_modi].mean()

winner = "Modi" if modi_prediction > rahul_prediction else "Rahul"
print(f"The predicted winner is: {winner}")


Prediction accuracy with Logistic Regression: 87.97%
The predicted winner is: Modi
