In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("/content/trum_tweet_sentiment_analysis.csv",
                   encoding="ISO-8859-1",
                   on_bad_lines='skip', # Changed 'error_bad_lines' to 'on_bad_lines' and set to 'skip' to skip bad lines
                   engine="python") # you could also switch to the Python parsing engine
data.head()

Unnamed: 0,text,Sentiment
0,RT @JohnLeguizamo: #trump not draining swamp b...,0.0
1,ICYMI: Hackers Rig FM Radio Stations To Play A...,0.0
2,Trump protests: LGBTQ rally in New York https:...,1.0
3,"""Hi I'm Piers Morgan. David Beckham is awful b...",0.0
4,RT @GlennFranco68: Tech Firm Suing BuzzFeed fo...,0.0


In [3]:
df_text = data[['text']]

In [4]:
df_text.dropna()

Unnamed: 0,text
0,RT @JohnLeguizamo: #trump not draining swamp b...
1,ICYMI: Hackers Rig FM Radio Stations To Play A...
2,Trump protests: LGBTQ rally in New York https:...
3,"""Hi I'm Piers Morgan. David Beckham is awful b..."
4,RT @GlennFranco68: Tech Firm Suing BuzzFeed fo...
...,...
31100,"RT @randyprine: Um, No Trump couldn't get loan..."
31101,Hahaha...Republicans will have to start drinki...
31102,RT @strummerriot: London police escorting prot...
31103,RT @FactorsTalcott: TRUMP?: If The Wall Will T...


In [5]:
import re
def remove_urls(text):
  """
  This function will try to remove URL present in out dataset and replace it with space using regex library.
  Input Args:
  text: strings of text that may contain URLs.
  Output Args:
  text: URLs replaces with text
  """
  url_pattern = re.compile(r'https?://\S+|www\.\S+')
  return url_pattern.sub(r'', text)


In [6]:
def remove_emoji(string):
  """
  This function will replace the emoji in string with whitespace
  """
  emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
  return emoji_pattern.sub(r' ', string)

In [7]:
def removeunwanted_characters(document):
  """
  This function will remove all the unwanted characters from the input dataset.
  Input Args:
  documet: A text data to be cleaned.
  Return:
  A cleaned document.
  """
  # remove user mentions
  document = re.sub("@[A-Za-z0-9_]+"," ", document)
  # remove hashtags
  document = re.sub("#[A-Za-z0-9_]+","", document)
  # remove punctuation
  document = re.sub("[^0-9A-Za-z ]", "" , document)
  #remove emojis
  document = remove_emoji(document)
  # remove double spaces
  document = document.replace('  ',"")
  return document.strip()

In [8]:
import nltk
nltk.download('punkt')
from nltk import word_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop_words = set(stopwords.words('english'))
custom_stopwords = ['@', 'RT']
stop_words.update(custom_stopwords)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [9]:
from nltk.tokenize import RegexpTokenizer

from nltk.tokenize import RegexpTokenizer

def remove_punct(text):
  """
  This function removes the punctutations present in our text data.
  Input Args:
  text: text data.
  Returns:
  text: cleaned text.
  """
  tokenizer = RegexpTokenizer(r"\w+")
  lst=tokenizer.tokenize(' '.join(text))
  return lst

In [10]:

def remove_stopwords(text_tokens):
  """
  This function removes all the stopwords present in out text tokens.
  Input Args:
  text_tokens: tokenize input of our datasets.
  Returns:
  result_tokens: list of token without stopword.
  """

  result_tokens = []
  for token in text_tokens:
    if token not in stop_words:
       result_tokens.append(token)
  return result_tokens

In [11]:
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize,pos_tag
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

def lemmatization(token_text):
  """
  This function performs the lemmatization operations as explained above.
  Input Args:
  token_text: list of tokens.
  Returns:
  lemmatized_tokens: list of lemmatized tokens.
  """
  lemma_tokens = []
  wordnet = WordNetLemmatizer()
  lemmatized_tokens = [wordnet.lemmatize(token, pos = 'v') for token in token_text]

  return lemmatized_tokens




[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [12]:
from nltk.stem import PorterStemmer

def stemming(text):
  """
  This function performs stemming operations.
  Input Args:
  token_text: list of tokenize text.
  Returns:
  stemm_tokes: list of stemmed tokens.
  """
  porter = PorterStemmer()
  stemm_tokens = []
  for word in text:
    stemm_tokens.append(porter.stem(word))
  return stemm_tokens

In [13]:
def lower_order(text):
  """
  This function converts all the text in input text to lower order.
  Input Args:
  token_text : input text.
  Returns:
  small_order_text : text converted to small/lower order.
  """
  small_order_text = text.lower()
  return small_order_text

# Test:
sample_text = "This Is some Normalized TEXT"
sample_small = lower_order(sample_text)
print(sample_small)


this is some normalized text


## Helper Function for Text Cleaning:

Implement a Helper Function as per Text Preprocessing Notebook and Complete the following pipeline.

# Build a Text Cleaning Pipeline

In [14]:
def text_cleaning_pipeline(dataset, rule = "lemmatize"):
  """
  This...
  """
  # Convert the input to small/lower order.
  data = lower_order(dataset)
  # Remove URLs
  data = remove_urls(dataset)
  # Remove emojis
  data = remove_emoji(dataset)
  # Remove all other unwanted characters.
  data = removeunwanted_characters(dataset)
  # Create tokens.
  tokens = data.split()
  # Remove stopwords:
  tokens = remove_stopwords(tokens)
  if rule == "lemmatize":
    tokens = lemmatization(tokens)
  elif rule == "stem":
    tokens = stemming(tokens)
  else:
    print("Pick between lemmatize or stem")


  return " ".join(tokens)


In [15]:
for dataset in df_text["text"][:10]:  # Select only the first 10 tweets
  print(text_cleaning_pipeline(dataset))

RTnot drain swamp taxpayer dollars trip advertise properties httpstcogFBvUkMX9z
ICYMI Hackers Rig FM Radio Stations To Play AntiTrump Song httpstcofV1J4HbXAt httpstco7kwDnuBUUd
Trump protest LGBTQ rally New York httpstcoLfHRD9Ft5I byvia
Hi Im Piers Morgan David Beckham awful Donald Trump ok
Tech Firm Suing BuzzFeed Publishing Unverified Trump DossierhttpstcoYvkaKaBdrJ httpstco1KLQSJfaKg
Alec Baldwin return Trump SNL evil Bannon actually president httpstcoRfndLqBLWV
REPUBLICANS YOU OWN YOUR TRUMP
For worldhttpstcoIrIzXPEm1f
A federal appeal court refuse immediately reinstate Trumps travel ban people httpstconcMHPwJYJA byvia
Court deny Trump request immediately restore travel ban httpstcoNNdNquqZLo


In [16]:
cleaned_tokens = data["text"].apply(lambda dataset: text_cleaning_pipeline(dataset))

In [17]:
cleaned_tokens[200]

'5B Melania never move w Trumpthat cover salaries 10k school teachers'

In [18]:

from sklearn.model_selection import train_test_split

X = cleaned_tokens
y = data['Sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training data shape:", X_train.shape)
print("Testing data shape:", X_test.shape)


Training data shape: (24884,)
Testing data shape: (6221,)


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the testing data (using the vocabulary learned from the training data)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print("Training TF-IDF shape:", X_train_tfidf.shape)
print("Testing TF-IDF shape:", X_test_tfidf.shape)


Training TF-IDF shape: (24884, 24273)
Testing TF-IDF shape: (6221, 24273)


In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Drop rows with NaN values in 'Sentiment' column *before* creating cleaned_tokens
data = data.dropna(subset=['Sentiment'])

# Now extract X and y after removing NaN values
X = data["text"].apply(lambda dataset: text_cleaning_pipeline(dataset))
y = data['Sentiment']

# Continue with your train_test_split and model training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the testing data (using the vocabulary learned from the training data)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Initialize and train the Logistic Regression model
model = LogisticRegression(max_iter=10000)  # Increased max_iter
model.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_tfidf)

# Evaluate the model and print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.92      0.98      0.95      4492
         1.0       0.94      0.79      0.85      1729

    accuracy                           0.93      6221
   macro avg       0.93      0.88      0.90      6221
weighted avg       0.93      0.93      0.92      6221



In [25]:
from sklearn.metrics import confusion_matrix

# Calculate the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Print the confusion matrix
print("Confusion Matrix:")
print(cm)

# Extract true positives, true negatives, false positives, and false negatives
tn, fp, fn, tp = cm.ravel()

print(f"True Negatives: {tn}")
print(f"False Positives: {fp}")
print(f"False Negatives: {fn}")
print(f"True Positives: {tp}")

# Calculate and print other metrics (optional)
accuracy = (tp + tn) / (tp + tn + fp + fn)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2 * (precision * recall) / (precision + recall)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1_score}")


# Example of how to use new data.  Replace with your actual new data.
new_data = pd.Series([
    "This is a positive tweet about Donald Trump.",
    "This is a negative tweet about Donald Trump.",
    "This is a neutral tweet about something else.",
])

# Clean the new data using your pipeline
new_data_cleaned = new_data.apply(lambda x: text_cleaning_pipeline(x))

# Vectorize the new data using the same vectorizer fit on training data
new_data_tfidf = tfidf_vectorizer.transform(new_data_cleaned)

# Predict sentiments for the new data
new_predictions = model.predict(new_data_tfidf)

# Print the predictions
print("\nPredictions for new data:")
new_predictions


Confusion Matrix:
[[4402   90]
 [ 371 1358]]
True Negatives: 4402
False Positives: 90
False Negatives: 371
True Positives: 1358
Accuracy: 0.925896158173927
Precision: 0.9378453038674033
Recall: 0.7854251012145749
F1-Score: 0.8548945546112685

Predictions for new data:


array([0., 0., 0.])

# Text Classification using Machine Learning Models


### 📝 Instructions: Trump Tweet Sentiment Classification

1. **Load the Dataset**  
   Load the dataset named `"trump_tweet_sentiment_analysis.csv"` using `pandas`. Ensure the dataset contains at least two columns: `"text"` and `"label"`.

2. **Text Cleaning and Tokenization**  
   Apply a text preprocessing pipeline to the `"text"` column. This should include:
   - Lowercasing the text  
   - Removing URLs, mentions, punctuation, and special characters  
   - Removing stopwords  
   - Tokenization (optional: stemming or lemmatization)
   - "Complete the above function"

3. **Train-Test Split**  
   Split the cleaned and tokenized dataset into **training** and **testing** sets using `train_test_split` from `sklearn.model_selection`.

4. **TF-IDF Vectorization**  
   Import and use the `TfidfVectorizer` from `sklearn.feature_extraction.text` to transform the training and testing texts into numerical feature vectors.

5. **Model Training and Evaluation**  
   Import **Logistic Regression** (or any machine learning model of your choice) from `sklearn.linear_model`. Train it on the TF-IDF-embedded training data, then evaluate it using the test set.  
   - Print the **classification report** using `classification_report` from `sklearn.metrics`.
