In [None]:
import pandas as pd
import numpy as np
# Read the tweet sentiment CSV into a DataFrame and preview its first rows.
data = pd.read_csv(
    filepath_or_buffer="/content/trum_tweet_sentiment_analysis.csv",
)
data.head(n=5)

Unnamed: 0,text,Sentiment
0,RT @JohnLeguizamo: #trump not draining swamp b...,0.0
1,ICYMI: Hackers Rig FM Radio Stations To Play A...,0.0
2,Trump protests: LGBTQ rally in New York https:...,1.0
3,"""Hi I'm Piers Morgan. David Beckham is awful b...",0.0
4,RT @GlennFranco68: Tech Firm Suing BuzzFeed fo...,0.0


## Helper Function for Text Cleaning:

Implement a helper function, following the Text Preprocessing notebook, and complete the pipeline below.

# Build a Text Cleaning Pipeline

In [None]:

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
import re

# Patterns are compiled once here instead of on every call, because the
# pipeline is applied row by row over the whole dataset.
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_MENTION_RE = re.compile(r'@\w+')
_EMOJI_RE = re.compile(
    r"["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    "]+",
    flags=re.UNICODE,
)
# Whitespace (\s) is kept so that newlines and tabs still separate words;
# stripping them would glue neighbouring tokens together.
_UNWANTED_CHARS_RE = re.compile(r"[^a-zA-Z0-9\s]")

_VALID_RULES = ("lemmatize", "stem")

# Lazily filled caches so the NLTK resources are built once, not once per row.
_STOP_WORDS_CACHE = {}
_NORMALIZER_CACHE = {}


def _get_stop_words():
    """Return the English stopword set, reading it from NLTK only on first use."""
    if "english" not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE["english"] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE["english"]


def _get_normalizer(rule):
    """Return the cached token-normalizing callable for a validated rule."""
    if rule not in _NORMALIZER_CACHE:
        if rule == "lemmatize":
            _NORMALIZER_CACHE[rule] = WordNetLemmatizer().lemmatize
        else:
            _NORMALIZER_CACHE[rule] = PorterStemmer().stem
    return _NORMALIZER_CACHE[rule]


def text_cleaning_pipeline(dataset, rule="lemmatize"):
    """
    Clean and normalize a single piece of text.

    The text is lowercased; URLs, @mentions, emojis and every character that
    is not a letter, a digit or whitespace are removed; English stopwords are
    dropped; and the remaining tokens are lemmatized or stemmed.

    Input Args:
      dataset: The text to clean. None and NaN are treated as missing text
               and yield an empty string; any other non-string value is
               converted with str() first.
      rule: The normalization rule to apply, either 'lemmatize' or 'stem'.
            Defaults to 'lemmatize'.

    Returns:
      A string with the cleaned, normalized tokens joined by single spaces.

    Raises:
      ValueError: If rule is neither 'lemmatize' nor 'stem'.
    """
    # Reject an unknown rule up front instead of silently returning text that
    # was never normalized.
    if rule not in _VALID_RULES:
        raise ValueError(
            f"rule must be 'lemmatize' or 'stem', got {rule!r}"
        )

    # Missing values (None, or float NaN as produced by pandas) have no
    # .lower(); treat them as empty text. NaN is the only float that is not
    # equal to itself.
    if dataset is None or (isinstance(dataset, float) and dataset != dataset):
        return ""

    # 1. Convert the input to lowercase.
    text = str(dataset).lower()

    # 2. Remove URLs.
    text = _URL_RE.sub('', text)

    # 3. Remove @mentions. This must happen before punctuation is stripped,
    #    otherwise only the '@' disappears and the username stays as a token.
    text = _MENTION_RE.sub(' ', text)

    # 4. Remove emojis (replaced by a space so adjacent words stay separate).
    text = _EMOJI_RE.sub(' ', text)

    # 5. Remove all other unwanted characters, keeping alphanumerics and
    #    whitespace.
    text = _UNWANTED_CHARS_RE.sub('', text)

    # 6. Create tokens and remove stopwords.
    stop_words = _get_stop_words()
    tokens = [token for token in text.split() if token not in stop_words]

    # 7. Lemmatization or stemming.
    normalize = _get_normalizer(rule)
    return " ".join(normalize(token) for token in tokens)




# Apply the cleaning pipeline to every entry of the 'text' column.
data['cleaned_text'] = data['text'].apply(text_cleaning_pipeline)

# Show up to 10 original/cleaned pairs. head(10) selects rows by position, so
# this works for any index and for frames with fewer than 10 rows, where a
# label lookup such as data['text'][i] would raise KeyError.
preview = data.head(10)
for original, cleaned in zip(preview['text'], preview['cleaned_text']):
    print(f"Original: {original}")
    print(f"Cleaned: {cleaned}\n")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Original: RT @JohnLeguizamo: #trump not draining swamp but our taxpayer dollars on his trips to advertise his properties! @realDonaldTrump https://t.co/gFBvUkMX9z
Cleaned: rt johnleguizamo trump draining swamp taxpayer dollar trip advertise property realdonaldtrump

Original: ICYMI: Hackers Rig FM Radio Stations To Play Anti-Trump Song https://t.co/fV1J4HbXAt https://t.co/7kwDnuBUUd
Cleaned: icymi hacker rig fm radio station play antitrump song

Original: Trump protests: LGBTQ rally in New York https://t.co/LfHRD9Ft5I by #BBCWorld via @c0nvey
Cleaned: trump protest lgbtq rally new york bbcworld via c0nvey

Original: "Hi I'm Piers Morgan. David Beckham is awful but Donald Trump is ok."
Cleaned: hi im pier morgan david beckham awful donald trump ok

Original: RT @GlennFranco68: Tech Firm Suing BuzzFeed for Publishing Unverified Trump Dossier  https://t.co/YvkaKaBdrJ https://t.co/1KLQSJfaKg
Cleaned: rt glennfranco68 tech firm suing buzzfeed publishing unverified trump dossier

Original: 

# Text Classification using Machine Learning Models


### 📝 Instructions: Trump Tweet Sentiment Classification

1. **Load the Dataset**  
   Load the dataset named `"trump_tweet_sentiment_analysis.csv"` using `pandas`. Ensure the dataset contains at least two columns: `"text"` (the tweet) and `"Sentiment"` (the label).

2. **Text Cleaning and Tokenization**  
   Apply a text preprocessing pipeline to the `"text"` column. This should include:
   - Lowercasing the text  
   - Removing URLs, mentions, punctuation, and special characters  
   - Removing stopwords  
   - Tokenization (optional: stemming or lemmatization)
   - Complete the `text_cleaning_pipeline` function above so that it performs these steps

3. **Train-Test Split**  
   Split the cleaned and tokenized dataset into **training** and **testing** sets using `train_test_split` from `sklearn.model_selection`.

4. **TF-IDF Vectorization**  
   Import and use the `TfidfVectorizer` from `sklearn.feature_extraction.text` to transform the training and testing texts into numerical feature vectors.

5. **Model Training and Evaluation**  
   Import **Logistic Regression** (or any machine learning model of your choice) from `sklearn.linear_model`. Train it on the TF-IDF-embedded training data, then evaluate it using the test set.  
   - Print the **classification report** using `classification_report` from `sklearn.metrics`.


# Train Test Split

In [None]:
# scikit-learn components used in this cell, imported here so the cell does
# not depend on an import made elsewhere.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# 1. Drop rows with a missing tweet or a missing label.
#    .copy() makes the result an independent DataFrame, so the column
#    assignment below does not raise SettingWithCopyWarning.
data = data.dropna(subset=['text', 'Sentiment']).copy()

# 2. Apply the text cleaning pipeline.
data['cleaned_text'] = data['text'].apply(text_cleaning_pipeline)

# 3. Train-test split: hold out 20% for testing, stratified on the label so
#    both splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    data['cleaned_text'],
    data['Sentiment'],
    test_size=0.2,
    random_state=42,
    stratify=data['Sentiment'],
)

# 4. TF-IDF vectorization: converts plain text into numerical feature vectors.
#    The vectorizer is fitted on the training split only and then reused to
#    transform the test split.
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# The goal is to classify text into two categories: positive or negative sentiment.
# This is a binary classification problem, for which logistic regression is a
# standard choice.

# 5. Train and evaluate.
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_tfidf, y_train)
y_pred = model.predict(X_test_tfidf)

print(classification_report(y_test, y_pred))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['cleaned_text'] = data['text'].apply(text_cleaning_pipeline)


              precision    recall  f1-score   support

         0.0       0.91      0.94      0.93    240206
         1.0       0.87      0.81      0.84    116524

    accuracy                           0.90    356730
   macro avg       0.89      0.88      0.88    356730
weighted avg       0.90      0.90      0.90    356730

