## Step 1: Import necessary libraries


In [22]:
import numpy as np
import pandas as pd
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used by the preprocessing step (stop-word list and
# WordNet data); this has to run before the lemmatizer/stopwords are used.
for nltk_package in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_package)

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

from sklearn.metrics import accuracy_score, classification_report


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Revanasidda\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Revanasidda\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Revanasidda\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Step 2: Load the Dataset


In [3]:
# The CSV is read with explicit column names; `names=` supplies the header
# labels used throughout the rest of the notebook.
column_names = ['target', 'id', 'date', 'flag', 'user', 'text']
twitter_data = pd.read_csv('../twitter_data.csv', names=column_names, encoding = 'ISO-8859-1')

# A bare expression is only rendered when it is the last line of a cell, so the
# shape has to be printed explicitly to appear alongside the preview below.
print(twitter_data.shape)
twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [23]:
# Missing values per column. Printed explicitly: a bare expression that is not
# the last line of the cell is evaluated and then silently discarded.
print(twitter_data.isnull().sum())

# Class balance of the sentiment label (rendered as the cell's output).
twitter_data['target'].value_counts()

target
0    800000
4    800000
Name: count, dtype: int64

## Step 3: Preprocess the Data

##### Target labels: 4 -> Positive Tweet, 0 -> Negative Tweet

In [2]:
# Compiled once: every character that is not an ASCII letter is replaced by a space.
_NON_ALPHA_PATTERN = re.compile('[^a-zA-Z]')

# Lazily-filled cache of the NLTK resources so they are built once per kernel
# instead of once per tweet.
_LEMMATIZATION_DEFAULTS = {}


def _lemmatization_defaults():
    """Return the shared WordNet lemmatizer and English stop-word set, building them on first use."""
    if not _LEMMATIZATION_DEFAULTS:
        _LEMMATIZATION_DEFAULTS['lemmatizer'] = WordNetLemmatizer()
        _LEMMATIZATION_DEFAULTS['stop_words'] = frozenset(stopwords.words("english"))
    return _LEMMATIZATION_DEFAULTS


def lemmatization(content, lemmatizer=None, stop_words=None):
    """Normalize one piece of text into a space-separated string of lemmas.

    Steps: replace every non-alphabetic character with a space, lowercase,
    split on whitespace, drop stop words, lemmatize each remaining token.

    Parameters
    ----------
    content : str
        Raw text. Any non-string value (e.g. ``None`` or a float NaN coming
        from a DataFrame column) yields an empty string instead of raising.
    lemmatizer : object, optional
        Any object exposing ``lemmatize(token) -> str``. Defaults to a shared
        ``WordNetLemmatizer`` that is created once and reused across calls.
    stop_words : collection of str, optional
        Lowercase tokens to discard. Defaults to NLTK's English stop-word
        list, loaded once and reused across calls.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces ('' if nothing is left).
    """
    # Missing values are not text; re.sub would raise TypeError on them.
    if not isinstance(content, str):
        return ''

    if lemmatizer is None:
        lemmatizer = _lemmatization_defaults()['lemmatizer']
    if stop_words is None:
        stop_words = _lemmatization_defaults()['stop_words']

    tokens = _NON_ALPHA_PATTERN.sub(' ', content).lower().split()
    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )

In [None]:
# Clean every tweet; this is the slow step (the dataset has 1.6M rows).
twitter_data['lemmatized_content'] = twitter_data['text'].apply(lemmatization)

# Persist the cleaned frame. The next cell reloads '../twitter_data_cleaned.csv',
# which nothing else in the visible notebook writes, so without this step a
# fresh Restart & Run All fails there with a missing-file error.
twitter_data.to_csv('../twitter_data_cleaned.csv', index=False, encoding='ISO-8859-1')

In [8]:
# Reload the preprocessed dataset (includes the 'lemmatized_content' column).
# Note that this rebinds `twitter_data`, replacing the raw frame loaded earlier.
twitter_data = pd.read_csv('../twitter_data_cleaned.csv', encoding = 'ISO-8859-1')

# A bare expression is only rendered when it is the last line of a cell, so the
# shape has to be printed explicitly to appear alongside the preview below.
print(twitter_data.shape)
twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text,lemmatized_content
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot http twitpic com zl awww bummer sho...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset update facebook texting might cry result...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dived many time ball managed save res...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole body feel itchy like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behaving mad see


In [9]:
# Peek at the first few cleaned tweets.
twitter_data.loc[:, 'lemmatized_content'].head()

0    switchfoot http twitpic com zl awww bummer sho...
1    upset update facebook texting might cry result...
2    kenichan dived many time ball managed save res...
3                      whole body feel itchy like fire
4                     nationwideclass behaving mad see
Name: lemmatized_content, dtype: object

In [10]:
# Peek at the first few sentiment labels.
twitter_data.loc[:, 'target'].head()

0    0
1    0
2    0
3    0
4    0
Name: target, dtype: int64

## Step 4: Define the Models

We'll use:
- `TfidfVectorizer` for text feature extraction.
- `MultinomialNB` for Naive Bayes classification.
- `LogisticRegression` for logistic regression classification.
- `VotingClassifier` to combine both using soft voting.


In [13]:
# Labels are the sentiment codes cast to strings; features are the cleaned text.
y = twitter_data['target'].astype(str)
x = twitter_data['lemmatized_content']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)
print(x_train.shape, x_test.shape)

(1280000,) (320000,)


In [14]:
# Replace missing text with an empty string in both splits so every entry
# handed to the vectorizer is a string.
x_train, x_test = (split.fillna("") for split in (x_train, x_test))

In [15]:
# Text features: TF-IDF capped at 5000 features.
vectorizer = TfidfVectorizer(max_features=5000)

# Base classifiers that the ensemble combines.
nb_model = MultinomialNB()
lr_model = LogisticRegression(random_state=42, max_iter=200)

# Soft voting combines the base classifiers' predicted probabilities.
base_estimators = [('nb', nb_model), ('lr', lr_model)]
ensemble_model = VotingClassifier(estimators=base_estimators, voting='soft')


##  Step 5: Create a Pipeline

Building a pipeline to:
1. Convert text to TF-IDF vectors.
2. Apply the VotingClassifier.


In [19]:
# Chain the two stages: raw text -> TF-IDF vectors -> voting ensemble.
pipeline_steps = [
    ('vectorizer', vectorizer),
    ('ensemble', ensemble_model),
]
model_pipeline = Pipeline(steps=pipeline_steps)


##  Step 6: Train the Model


In [20]:
# Fit the vectorizer and the ensemble on the training split.
model_pipeline.fit(X=x_train, y=y_train)

##  Step 7: Evaluate the Model

In [23]:
# Predict on test set
y_pred = model_pipeline.predict(x_test)

# Print evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.766459375

Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.75      0.76    159494
           4       0.76      0.78      0.77    160506

    accuracy                           0.77    320000
   macro avg       0.77      0.77      0.77    320000
weighted avg       0.77      0.77      0.77    320000



### Save the trained pipeline model

In [24]:
import joblib

# Serialize the whole fitted pipeline (vectorizer + ensemble) to a single file.
joblib.dump(value=model_pipeline, filename="model_pipeline.joblib")
print("Model saved as 'model_pipeline.joblib'")


Model saved as 'model_pipeline.joblib'
