In [120]:
#Imports Libraries
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [121]:
nltk.download('stopwords')

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

In [122]:
# Dataset Load
df = pd.read_csv("Twitter_data.csv")

In [123]:
#print columns
print(df.head())

                                               tweet  label
0  when modi promised “minimum government maximum...   -1.0
1  talk all the nonsense and continue all the dra...    0.0
2  what did just say vote for modi  welcome bjp t...    1.0
3  asking his supporters prefix chowkidar their n...    1.0
4  answer who among these the most powerful world...    1.0


In [124]:
#print columns name
print(df.columns)

Index(['tweet', 'label'], dtype='object')


In [125]:
print(df['label'].unique())
print(type(df['label']))

[-1.  0.  1. nan]
<class 'pandas.core.series.Series'>


In [126]:
#Change labels -1, 1 into 0, 1
df['label'] = df['label'].replace({-1.0:0, 1.0:1})

In [127]:
print(df['label'].isna().sum())

7


In [128]:
df = df.dropna(subset=['label'])

In [130]:
# 3) Cleaning Function
def clean_text(text):
    if pd.isna(text):
        return ""
    text = str(text)
    text = re.sub(r'http\S+', '', text)      # remove links
    text = re.sub(r'@\w+', '', text)         # remove mentions
    text = re.sub(r'#\w+', '', text)         # remove hashtags
    text = re.sub(r'[^A-Za-z\s]', ' ', text) # keep letters only
    text = text.lower().strip()
    sw = set(stopwords.words('english'))
    words = [w for w in text.split() if w not in sw]
    return " ".join(words)



In [131]:
#Apply Cleaning
df['clean_tweet'] = df['tweet'].apply(clean_text)

In [132]:
#Remove blank rows
df = df[df['clean_tweet'].str.strip().astype(bool)]

In [133]:
print("Sample Data:\n", df.head())

Sample Data:
                                                tweet  label  \
0  when modi promised “minimum government maximum...    0.0   
1  talk all the nonsense and continue all the dra...    0.0   
2  what did just say vote for modi  welcome bjp t...    1.0   
3  asking his supporters prefix chowkidar their n...    1.0   
4  answer who among these the most powerful world...    1.0   

                                         clean_tweet  
0  modi promised minimum government maximum gover...  
1             talk nonsense continue drama vote modi  
2  say vote modi welcome bjp told rahul main camp...  
3  asking supporters prefix chowkidar names modi ...  
4  answer among powerful world leader today trump...  


In [134]:
#Features + Labels
X = df['clean_tweet']
y = df['label']

In [None]:
#Train-Test Split
X_train_text, X_test_text, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [136]:
#Vectorization
cv = CountVectorizer(stop_words='english')
x_train_vec = cv.fit_transform(X_train_text)
x_test_vec = cv.transform(X_test_text)

In [137]:
#Model Training
model = LogisticRegression(max_iter=1000)
model.fit(x_train_vec, y_train)

In [138]:
#Predictions
y_pred = model.predict(x_test_vec)

In [139]:
#check accuracy and Classification Report
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.8901746523834372

Classification Report:
               precision    recall  f1-score   support

         0.0       0.88      0.93      0.90     18123
         1.0       0.91      0.84      0.87     14456

    accuracy                           0.89     32579
   macro avg       0.89      0.88      0.89     32579
weighted avg       0.89      0.89      0.89     32579



In [140]:
#convert 1, 0 to possitive and negative
predicted_labels = ["Possitive" if val == 1 else "Negative" for val in y_pred]

In [141]:
#output save in csv file
output = pd.DataFrame({
    "tweet": X_test_text.values,
    "predicted_labels": predicted_labels
})

output.to_csv("Output.csv", index=False)