<a href="https://colab.research.google.com/github/RonitShetty/NLP-Labs/blob/main/C070_RonitShetty_NLPLab2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# **Label Encoding**

In [2]:
# --- 1. Label Encoding ---
# Toy sentiment labels; the encoder maps each distinct string to an integer code.
labels = ["positive","negative","neutral","positive"]

encoder = LabelEncoder()
encoder.fit(labels)                          # learn the set of classes
encoded_labels = encoder.transform(labels)   # map every label to its code
encoded_labels

array([2, 0, 1, 2])

# **One Hot Encoding**

In [3]:
# --- 2. One-Hot Encoding ---
# OneHotEncoder expects a 2-D input, so the labels are reshaped into a single column
# (one row per sample).
data = np.array(labels).reshape(-1,1)
# sparse_output=False returns a dense NumPy array instead of a SciPy sparse matrix,
# which is easier to inspect in the notebook.
onehot_encoder = OneHotEncoder(sparse_output=False)
onehot_labels = onehot_encoder.fit_transform(data)
onehot_labels

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

# **BoW and TF-IDF on Kaggle dataset (Tweets.csv)**

In [6]:
# --- Step 0: Import Necessary Libraries ---
# pandas is used for data manipulation and reading CSV files.
# nltk (Natural Language Toolkit) is used for text processing tasks like tokenization and stop word removal.
# scikit-learn (sklearn) is used for machine learning, from splitting data to building and evaluating models.

import pandas as pd
import numpy as np
import nltk
nltk.download('punkt_tab')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# --- Step 1: Load and Prepare the Dataset ---
# We start by loading the 'Tweets.csv' file into a pandas DataFrame.
# A try-except block is used to handle the case where the file might not be found.
# We then select only the 'text' and 'airline_sentiment' columns, as they are the ones we need.
# Finally, `dropna()` removes any rows with missing values to ensure data quality.

print("--- Step 1: Loading and Preparing Data ---")
try:
    raw_df = pd.read_csv('Tweets.csv')
except FileNotFoundError:
    print("Error: 'Tweets.csv' not found. Using a sample DataFrame for demonstration.")
    sample_df = pd.DataFrame({
        'text': ["Sample positive tweet", "Sample negative tweet", "This is a neutral one."],
        'airline_sentiment': ["positive", "negative", "neutral"]
    })
    # The stratified 80/20 split performed later needs at least two rows per class and
    # at least as many test rows as there are classes. Repeating the three sample rows
    # five times (15 rows, 5 per class) keeps the rest of the pipeline runnable.
    raw_df = pd.concat([sample_df] * 5, ignore_index=True)
else:
    # Only report success when the real dataset was actually read.
    print("Data loaded successfully.")

# Keep only the columns we need and drop incomplete rows. Chaining a non-inplace
# `dropna()` yields a fresh DataFrame, so adding columns to `df` later does not
# trigger pandas' SettingWithCopyWarning.
df = raw_df[['text', 'airline_sentiment']].dropna()
print("-" * 50)


# --- Step 2: Preprocess the Text Data ---
# This is a crucial step to clean and standardize the text data.
# 1. Lowercasing: The tweet is converted to lowercase so that words like "Flight" and "flight" are treated as the same.
# 2. Tokenization: The lowercased tweet is broken down into individual words (tokens).
# 3. Filtering: Tokens containing non-alphabetic characters (e.g., punctuation, numbers) and common English
#    stop words that don't add much meaning (e.g., 'a', 'the', 'is') are removed.
# 4. Stemming: The remaining words are reduced to their stem (e.g., 'flying', 'fly' -> 'fli'). This helps group related words.
# The `preprocess_text` function performs these steps, and we apply it to every tweet in our DataFrame.

print("--- Step 2: Preprocessing Text ---")
# Make sure the NLTK data used below is available; without it NLTK raises a LookupError.
# 'stopwords' provides the stop-word list and 'punkt' is the tokenizer model.
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

# Build the shared helpers once: a set gives fast membership checks for stop words.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    """Clean a single piece of text and return it as a space-separated string of stems.

    The text is lowercased and tokenized with ``nltk.word_tokenize``. Tokens that
    are not purely alphabetic, or that appear in the module-level ``stop_words``
    set, are discarded; every remaining token is stemmed with the module-level
    ``stemmer``.

    Args:
        text: The raw text to clean (must support ``.lower()``).

    Returns:
        The stemmed tokens joined by single spaces (empty string if none survive).
    """
    kept_stems = []
    for token in nltk.word_tokenize(text.lower()):
        # Skip anything containing non-letters as well as stop words.
        if not token.isalpha() or token in stop_words:
            continue
        kept_stems.append(stemmer.stem(token))
    return " ".join(kept_stems)

# Clean every tweet and store the result alongside the original text.
processed_column = df['text'].apply(preprocess_text)
df['processed_text'] = processed_column
print("Text preprocessing complete.")
print("-" * 50)


# --- Step 3: Inspect the Preprocessed Output (Optional but Recommended) ---
# Before moving on, it's a good practice to look at the results of our preprocessing.
# This helps verify that our cleaning function is working as expected. We'll look at the
# original vs. processed text and the list of tokens for the first tweet.

print("--- Step 3: Inspecting Processed Text and Tokens ---")
# Side-by-side preview of the raw and cleaned text for the first 5 tweets
print("Original vs. Processed Text:")
preview = df[['text', 'processed_text']].head(5)
print(preview)

# The processed text is a space-joined string, so splitting it recovers the token list
first_processed_tweet = df['processed_text'].iloc[0]
tokens = first_processed_tweet.split()
print("\nTokens from the first processed tweet:")
print(tokens)
print("-" * 50)


# --- Step 4: Split Data into Training and Testing Sets ---
# We split our dataset to train the model on one part (training set) and evaluate its
# performance on a separate, unseen part (testing set). This shows how well the model generalizes.
# `test_size=0.2` means 20% of the data is for testing.
# `stratify=y` ensures that the proportion of sentiments (positive, negative, neutral) is the same
# in both the training and testing sets, which is important for imbalanced datasets.

print("--- Step 4: Splitting Data ---")
# Features are the cleaned tweets; the target is the sentiment label.
X, y = df['processed_text'], df['airline_sentiment']

# Hold out 20% of the rows for testing. The fixed seed makes the split repeatable,
# and stratifying on y keeps the sentiment proportions the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
n_train, n_test = len(X_train), len(X_test)
print(f"Training set size: {n_train}, Testing set size: {n_test}")
print("-" * 50)


# --- Step 5: Encode Target Labels ---
# Machine learning models require numerical inputs. Our sentiment labels ('positive', 'negative', 'neutral')
# are strings. `LabelEncoder` converts these strings into numbers (e.g., 0, 1, 2).
# We `fit_transform` on the training labels and only `transform` the test labels to prevent data leakage.

print("--- Step 5: Applying Label Encoding to Target Variable ---")
label_encoder = LabelEncoder()
# Learn the classes from the training labels only, then reuse that mapping for the test labels.
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Print the mapping to see which number corresponds to which sentiment.
# LabelEncoder encodes each class as its position in `classes_`, so enumerating
# `classes_` gives the mapping directly. Using plain Python ints (instead of the
# NumPy integers returned by `transform`) keeps the printed dict readable under
# NumPy >= 2, where NumPy scalars are shown as e.g. `np.int64(0)`.
label_mapping = {str(label): code for code, label in enumerate(label_encoder.classes_)}
print("Label mapping:", label_mapping)
print("-" * 50)


# --- Step 6: Vectorize Text Data (BoW and TF-IDF) ---
# Here, we convert the cleaned text into numerical vectors that our model can understand.
# We will create two separate sets of features to compare their performance.

print("--- Step 6: Vectorizing Text ---")
# Method A: Bag-of-Words (BoW)
# Each document becomes a vector of raw word counts.
# `max_features=2000` caps the vocabulary at the 2000 most frequent terms.
bow_vectorizer = CountVectorizer(max_features=2000)
# The vocabulary is learned from the training split only; the test split is
# transformed with that same vocabulary.
X_train_bow, X_test_bow = (
    bow_vectorizer.fit_transform(X_train),
    bow_vectorizer.transform(X_test),
)
print("BoW vectorization complete.")

# Method B: TF-IDF (Term Frequency-Inverse Document Frequency)
# Counts are re-weighted so that terms frequent in one document but rare across
# the corpus score higher. It is compared against BoW in the evaluation step.
tfidf_vectorizer = TfidfVectorizer(max_features=2000)
X_train_tfidf, X_test_tfidf = (
    tfidf_vectorizer.fit_transform(X_train),
    tfidf_vectorizer.transform(X_test),
)
print("TF-IDF vectorization complete.")
print("-" * 50)


# --- Step 7: Inspect the Vocabulary ---
# Let's look at the words the BoW vectorizer learned from the training data. This is our model's "vocabulary".
# Its length is at most the `max_features` we set earlier (it equals `max_features` whenever the
# training data contains at least that many distinct terms).

print("--- Step 7: Inspecting BoW Vocabulary ---")
# Feature names learned by the fitted BoW vectorizer
bow_vocabulary = bow_vectorizer.get_feature_names_out()

print("Sample of BoW Vocabulary (first 100 words):")
# Convert the slice to a plain Python list so it prints as a simple list of strings
print(bow_vocabulary[:100].tolist())

vocabulary_size = len(bow_vocabulary)
print(f"\nTotal vocabulary size: {vocabulary_size}")
print("-" * 50)


# --- Step 8: Model Training and Evaluation ---
# Now we train our classification model (Logistic Regression) on the vectorized data and evaluate its performance.
# We do this separately for both BoW and TF-IDF features to see which representation works better.

def _train_and_evaluate(feature_name, X_train_vec, X_test_vec, y_train_enc, y_test_enc, class_names):
    """Train a Logistic Regression model on one feature set and report its test performance.

    Args:
        feature_name: Short label for the feature representation (e.g. "BoW"),
            used only in the printed messages.
        X_train_vec: Vectorized training features.
        X_test_vec: Vectorized test features.
        y_train_enc: Encoded training labels.
        y_test_enc: Encoded test labels.
        class_names: Class names shown in the classification report.

    Returns:
        A tuple ``(model, predictions, accuracy)`` with the fitted model, its
        predictions on the test features, and the test accuracy.
    """
    print(f"\n--- Model Training with {feature_name} ---")
    model = LogisticRegression(max_iter=1000)  # max_iter increased for convergence
    model.fit(X_train_vec, y_train_enc)

    # Make predictions on the test set
    predictions = model.predict(X_test_vec)

    # Evaluate the model
    accuracy = accuracy_score(y_test_enc, predictions)
    print(f"Accuracy with {feature_name} features: {accuracy:.4f}")
    print(
        f"Classification Report ({feature_name}):\n",
        classification_report(y_test_enc, predictions, target_names=class_names),
    )
    return model, predictions, accuracy


print("--- Step 8: Training and Evaluating Models ---")
# The same training/evaluation routine is run for both representations so that the
# results are directly comparable.

# Train and evaluate on Bag-of-Words (BoW) features
model_bow, y_pred_bow, accuracy_bow = _train_and_evaluate(
    "BoW", X_train_bow, X_test_bow, y_train_encoded, y_test_encoded, label_encoder.classes_
)

# Train and evaluate on TF-IDF features
model_tfidf, y_pred_tfidf, accuracy_tfidf = _train_and_evaluate(
    "TF-IDF", X_train_tfidf, X_test_tfidf, y_train_encoded, y_test_encoded, label_encoder.classes_
)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


--- Step 1: Loading and Preparing Data ---
Data loaded successfully.
--------------------------------------------------
--- Step 2: Preprocessing Text ---
Text preprocessing complete.
--------------------------------------------------
--- Step 3: Inspecting Processed Text and Tokens ---
Original vs. Processed Text:
                                                text  \
0                @VirginAmerica What @dhepburn said.   
1  @VirginAmerica plus you've added commercials t...   
2  @VirginAmerica I didn't today... Must mean I n...   
3  @VirginAmerica it's really aggressive to blast...   
4  @VirginAmerica and it's a really big bad thing...   

                                      processed_text  
0                        virginamerica dhepburn said  
1         virginamerica plu ad commerci experi tacki  
2  virginamerica today must mean need take anoth ...  
3  virginamerica realli aggress blast obnoxi ente...  
4                 virginamerica realli big bad thing  

Tokens from the