
```
Student ID: 2357572
Student Name: Suraj Kanwar
```



In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Helper Function for Text Cleaning:

Implement a Helper Function as per Text Preprocessing Notebook and Complete the following pipeline.

In [4]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [5]:
# Download required NLTK resources
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Build a Text Cleaning Pipeline

In [6]:
# Text Cleaning Pipeline
def text_cleaning_pipeline(dataset, rule="lemmatize"):
    """
    Preprocesses a single text string by cleaning, tokenizing, removing stopwords,
    and applying lemmatization or stemming.

    Args:
        dataset (str): Input text to be cleaned.
        rule (str): Either 'lemmatize' or 'stem' to choose the normalization method.

    Returns:
        str: Cleaned and processed text as a single string.
    """
    if not isinstance(dataset, str):
        return ""

    # Convert to lowercase
    data = dataset.lower()

    # Remove URLs
    data = re.sub(r'https?://\S+|www\.\S+', '', data)

    # Remove emojis
    data = re.sub(r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F\U0001F780-\U0001F7FF\U0001F800-\U0001F8FF\U0001F900-\U0001F9FF\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF\U00002702-\U000027B0\U000024C2-\U0001F251]+', '', data)

    # Remove unwanted characters (keep alphanumeric, spaces, and basic punctuation)
    data = re.sub(r'[^a-z0-9\s.,!?]', '', data)

    # Tokenize
    tokens = word_tokenize(data)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Apply lemmatization or stemming
    if rule == "lemmatize":
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    elif rule == "stem":
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]
    else:
        print("Pick between lemmatize or stem")
        return ""

    return " ".join(tokens)

# Text Classification using Machine Learning Models


### 📝 Instructions: Trump Tweet Sentiment Classification

1. **Load the Dataset**  
   Load the dataset named `"trump_tweet_sentiment_analysis.csv"` using `pandas`. Ensure the dataset contains at least two columns: `"text"` and `"label"`.

2. **Text Cleaning and Tokenization**  
   Apply a text preprocessing pipeline to the `"text"` column. This should include:
   - Lowercasing the text  
   - Removing URLs, mentions, punctuation, and special characters  
   - Removing stopwords  
   - Tokenization (optional: stemming or lemmatization)
   - "Complete the above function"

3. **Train-Test Split**  
   Split the cleaned and tokenized dataset into **training** and **testing** sets using `train_test_split` from `sklearn.model_selection`.

4. **TF-IDF Vectorization**  
   Import and use the `TfidfVectorizer` from `sklearn.feature_extraction.text` to transform the training and testing texts into numerical feature vectors.

5. **Model Training and Evaluation**  
   Import **Logistic Regression** (or any machine learning model of your choice) from `sklearn.linear_model`. Train it on the TF-IDF-embedded training data, then evaluate it using the test set.  
   - Print the **classification report** using `classification_report` from `sklearn.metrics`.


In [7]:
# Step 1: Load the dataset
data = pd.read_csv('/content/drive/MyDrive/Artificial Intelligence and Machine Learning/Week:8/trum_tweet_sentiment_analysis.csv')

# Verify the columns
print(data.columns)
print(data.head())

Index(['text', 'Sentiment'], dtype='object')
                                                text  Sentiment
0  RT @JohnLeguizamo: #trump not draining swamp b...          0
1  ICYMI: Hackers Rig FM Radio Stations To Play A...          0
2  Trump protests: LGBTQ rally in New York https:...          1
3  "Hi I'm Piers Morgan. David Beckham is awful b...          0
4  RT @GlennFranco68: Tech Firm Suing BuzzFeed fo...          0


In [8]:
# Step 2: Text Cleaning and Tokenization
# Drop rows with missing 'text' values
data = data.dropna(subset=['text'])

# Apply the text cleaning pipeline
data['cleaned_text'] = data['text'].apply(lambda x: text_cleaning_pipeline(x, rule="lemmatize"))

# Verify the cleaned text
print(data[['text', 'cleaned_text']].head())

                                                text  \
0  RT @JohnLeguizamo: #trump not draining swamp b...   
1  ICYMI: Hackers Rig FM Radio Stations To Play A...   
2  Trump protests: LGBTQ rally in New York https:...   
3  "Hi I'm Piers Morgan. David Beckham is awful b...   
4  RT @GlennFranco68: Tech Firm Suing BuzzFeed fo...   

                                        cleaned_text  
0  rt johnleguizamo trump draining swamp taxpayer...  
1  icymi hacker rig fm radio station play antitru...  
2  trump protest lgbtq rally new york bbcworld vi...  
3  hi im pier morgan . david beckham awful donald...  
4  rt glennfranco68 tech firm suing buzzfeed publ...  


In [9]:
# Step 3: Train-Test Split
# Define features (cleaned text) and labels (correct column name: 'Sentiment')
X = data['cleaned_text']
y = data['Sentiment']  # Corrected from 'label' to 'Sentiment'

# Split the data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Verify the split
print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

Training set size: 1480098
Testing set size: 370025


In [10]:
# Step 4: TF-IDF Vectorization
# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')

# Fit and transform the training data
X_train_tfidf = vectorizer.fit_transform(X_train)

# Transform the testing data
X_test_tfidf = vectorizer.transform(X_test)

# Verify the shape of the TF-IDF matrices
print(f"Training TF-IDF shape: {X_train_tfidf.shape}")
print(f"Testing TF-IDF shape: {X_test_tfidf.shape}")

Training TF-IDF shape: (1480098, 5000)
Testing TF-IDF shape: (370025, 5000)


In [11]:
# Step 5: Model Training and Evaluation
# Initialize the Logistic Regression model
model = LogisticRegression(random_state=42)

# Train the model
model.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_tfidf)

# Print the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.95      0.94    248563
           1       0.90      0.85      0.87    121462

    accuracy                           0.92    370025
   macro avg       0.91      0.90      0.90    370025
weighted avg       0.92      0.92      0.92    370025

