In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem so the dataset archive stored
# under MyDrive can be reached by path in the cells that follow.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import zipfile
import os

# Extract the dataset archive from Drive into Colab's local storage.
#
# Pure-Python extraction is used instead of `!unzip` because `unzip` asks
# interactively whether to replace files that already exist, which stalls the
# cell on a re-run; `ZipFile.extractall` simply overwrites, so the cell is
# safe to execute repeatedly.
ARCHIVE_PATH = "/content/drive/MyDrive/trum_tweet_sentiment_analysis (1).zip"
EXTRACT_DIR = "/content"

with zipfile.ZipFile(ARCHIVE_PATH) as archive:
    print("Archive: ", ARCHIVE_PATH)
    # List each member with its destination path so the log shows what was written.
    for member_name in archive.namelist():
        print("  inflating:", os.path.join(EXTRACT_DIR, member_name))
    archive.extractall(EXTRACT_DIR)

Archive:  /content/drive/MyDrive/trum_tweet_sentiment_analysis (1).zip
  inflating: /content/trum_tweet_sentiment_analysis (1).csv  


In [None]:
import os
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


# Read the CSV that the unzip cell extracted into local Colab storage; going
# back to the archive on the Drive mount would make that extraction pointless.
# If the extracted file is missing (e.g. the unzip cell was skipped), fall back
# to the zipped copy on Drive, which pandas decompresses based on the ".zip"
# extension.
extracted_csv = '/content/trum_tweet_sentiment_analysis (1).csv'
drive_archive = '/content/drive/MyDrive/trum_tweet_sentiment_analysis (1).zip'
file_path = extracted_csv if os.path.exists(extracted_csv) else drive_archive
df = pd.read_csv(file_path)


# Display the first few rows to verify
print(df.head())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                                text  Sentiment
0  RT @JohnLeguizamo: #trump not draining swamp b...          0
1  ICYMI: Hackers Rig FM Radio Stations To Play A...          0
2  Trump protests: LGBTQ rally in New York https:...          1
3  "Hi I'm Piers Morgan. David Beckham is awful b...          0
4  RT @GlennFranco68: Tech Firm Suing BuzzFeed fo...          0


In [None]:
# Structural overview of the already-loaded frame: dimensions, a short
# preview, and the column index.
frame_shape = df.shape
print("Dataset shape:", frame_shape)
print("\nFirst 5 rows:")
preview_rows = df.head()
print(preview_rows)
print("\nColumn names:", df.columns)

# Class balance of the 'Sentiment' label column.
sentiment_counts = df['Sentiment'].value_counts()
print("\nSentiment distribution:")
print(sentiment_counts)

# Provisional feature/label handles on the raw columns. The train-test split
# cell rebinds X to the cleaned text before any modelling happens.
X, y = df['text'], df['Sentiment']

Dataset shape: (1850123, 2)

First 5 rows:
                                                text  Sentiment
0  RT @JohnLeguizamo: #trump not draining swamp b...          0
1  ICYMI: Hackers Rig FM Radio Stations To Play A...          0
2  Trump protests: LGBTQ rally in New York https:...          1
3  "Hi I'm Piers Morgan. David Beckham is awful b...          0
4  RT @GlennFranco68: Tech Firm Suing BuzzFeed fo...          0

Column names: Index(['text', 'Sentiment'], dtype='object')

Sentiment distribution:
Sentiment
0    1244211
1     605912
Name: count, dtype: int64


In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch only the NLTK resources the preprocessing function below relies on:
#   punkt / punkt_tab -> word_tokenize
#   stopwords         -> stopwords.words('english')
#   wordnet / omw-1.4 -> WordNetLemmatizer
# Downloading the complete 'all' collection is unnecessary for this pipeline
# and makes a fresh run far slower than it needs to be.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource, quiet=True)

from functools import lru_cache

# Patterns are compiled once at definition time instead of being looked up on
# every call; the pattern strings themselves are unchanged.
_NOISE_RE = re.compile(r'http\S+|www\S+|https\S+|@\w+|#\w+')
_NON_WORD_RE = re.compile(r'[^\w\s]')
_WHITESPACE_RE = re.compile(r'\s+')


@lru_cache(maxsize=None)
def _english_stopwords():
    """Load the NLTK English stopword list once and return it as a frozenset."""
    return frozenset(stopwords.words('english'))


@lru_cache(maxsize=None)
def _shared_lemmatizer():
    """Create a single WordNetLemmatizer that every call reuses."""
    return WordNetLemmatizer()


def preprocess_text(text, lemmatize=True):
    """
    Clean one piece of text and return it as a space-joined token string.

    Pipeline, in order:
    - Convert the input to ``str`` and lowercase it
    - Strip URLs, @mentions and #hashtags (the whole hashtag is dropped)
    - Replace punctuation / special characters with spaces
    - Collapse runs of whitespace
    - Tokenize with ``word_tokenize``
    - Drop English stopwords
    - Lemmatize the remaining tokens (only when ``lemmatize`` is true)
    - Drop tokens of length 1

    Args:
        text: Value to clean; it is passed through ``str()`` first, so
            non-string values are accepted.
        lemmatize: When true (default), lemmatize tokens with
            ``WordNetLemmatizer``.

    Returns:
        The cleaned tokens joined by single spaces. If any step raises, a
        truncated error message is printed and an empty string is returned.
    """
    try:
        # Lowercase
        text = str(text).lower()

        # Remove URLs, mentions (@user), hashtags
        text = _NOISE_RE.sub('', text)

        # Remove punctuation & special chars
        text = _NON_WORD_RE.sub(' ', text)

        # Remove extra whitespace
        text = _WHITESPACE_RE.sub(' ', text).strip()

        # Tokenize
        tokens = word_tokenize(text)

        # Remove stopwords (set is built on first use, then reused; resolved
        # inside the try so a loading failure is handled like any other error)
        stop_words = _english_stopwords()
        tokens = [word for word in tokens if word not in stop_words]

        # Lemmatization
        if lemmatize and tokens:
            lemmatizer = _shared_lemmatizer()
            tokens = [lemmatizer.lemmatize(word) for word in tokens]

        # Remove short words
        tokens = [word for word in tokens if len(word) > 1]

        return ' '.join(tokens)
    except Exception as e:
        # Best-effort: report the problem and yield an empty document rather
        # than aborting the whole DataFrame.apply run.
        print(f"Error processing text: {str(e)[:100]}...")  # Print first 100 chars of error
        return ""

# Run the cleaning function over every row of the raw 'text' column and store
# the result alongside it as 'cleaned_text' (no progress bar, for simplicity).
print("Starting text preprocessing...")
cleaned_series = df['text'].apply(preprocess_text)
df['cleaned_text'] = cleaned_series

# Show raw and cleaned text side by side for a quick sanity check.
print("\nSample cleaned text:")
side_by_side = df[['text', 'cleaned_text']]
print(side_by_side.head())

Starting text preprocessing...

Sample cleaned text:
                                                text  \
0  RT @JohnLeguizamo: #trump not draining swamp b...   
1  ICYMI: Hackers Rig FM Radio Stations To Play A...   
2  Trump protests: LGBTQ rally in New York https:...   
3  "Hi I'm Piers Morgan. David Beckham is awful b...   
4  RT @GlennFranco68: Tech Firm Suing BuzzFeed fo...   

                                        cleaned_text  
0  rt draining swamp taxpayer dollar trip adverti...  
1  icymi hacker rig fm radio station play anti tr...  
2             trump protest lgbtq rally new york via  
3  hi pier morgan david beckham awful donald trum...  
4  rt tech firm suing buzzfeed publishing unverif...  


Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Model inputs: the preprocessed text as features, the 'Sentiment' column as labels.
X, y = df['cleaned_text'], df['Sentiment']

# Hold out 20% of the rows for testing; random_state is fixed so the split is
# the same on every run.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print("\nTrain size:", len(X_train))
print("Test size:", len(X_test))

# Label counts within the training portion.
print("\nClass distribution in training set:")
train_label_counts = y_train.value_counts()
print(train_label_counts)


Train size: 1480098
Test size: 370025

Class distribution in training set:
Sentiment
0    995648
1    484450
Name: count, dtype: int64


TF-IDF Vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer with the vocabulary capped at 5000 features.
tfidf = TfidfVectorizer(max_features=5000)

# The vectorizer is fitted on the training split only; the test split is then
# transformed with that already-fitted vectorizer (never re-fitted).
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Report the dimensions of both resulting matrices.
shape_report = (
    ("\nTF-IDF shape (Train):", X_train_tfidf),
    ("TF-IDF shape (Test):", X_test_tfidf),
)
for heading, matrix in shape_report:
    print(heading, matrix.shape)


TF-IDF shape (Train): (1480098, 5000)
TF-IDF shape (Test): (370025, 5000)


Model Training & Evaluation (Logistic Regression)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Build the classifier (max_iter raised to 1000 to help convergence) and fit
# it on the TF-IDF training matrix.
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

# Predict labels for the held-out test matrix.
y_pred = model.predict(X_test_tfidf)

# Per-class precision / recall / F1 on the test split.
report_text = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(report_text)


Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.95      0.94    248563
           1       0.90      0.86      0.88    121462

    accuracy                           0.92    370025
   macro avg       0.91      0.90      0.91    370025
weighted avg       0.92      0.92      0.92    370025



Save & Load Model

In [None]:
import joblib

# Output file names for the two artifacts needed at inference time: the
# fitted classifier and the fitted TF-IDF vectorizer.
MODEL_FILE = 'trump_sentiment_model.pkl'
VECTORIZER_FILE = 'tfidf_vectorizer.pkl'

# To restore them later (for deployment):
# model = joblib.load('trump_sentiment_model.pkl')
# tfidf = joblib.load('tfidf_vectorizer.pkl')
# NOTE: only load .pkl files from a trusted source; unpickling can execute code.

joblib.dump(model, MODEL_FILE)
# Kept as the cell's final expression so the written path list is displayed.
joblib.dump(tfidf, VECTORIZER_FILE)

['tfidf_vectorizer.pkl']