# Loading Libraries

In [1]:
import numpy as np
import pandas as pd
import nltk
import spacy

In [2]:
# Load NLTK resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# Load the small SpaCy model
nlp = spacy.load('en_core_web_sm')

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Load dataset into a DataFrame
data = pd.read_csv('/content/drive/MyDrive/train.csv')
df = pd.DataFrame(data)
df

Unnamed: 0,Class Index,Title,Description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."
...,...,...,...
119995,1,Pakistan's Musharraf Says Won't Quit as Army C...,KARACHI (Reuters) - Pakistani President Perve...
119996,2,Renteria signing a top-shelf deal,Red Sox general manager Theo Epstein acknowled...
119997,2,Saban not going to Dolphins yet,The Miami Dolphins will put their courtship of...
119998,2,Today's NFL games,PITTSBURGH at NY GIANTS Time: 1:30 p.m. Line: ...


# Preprocessing the Data

## 2.a) Removing Empty Rows and Duplicates

In [6]:
#Printing how many null values are present initially
df.isnull().sum()

Class Index    0
Title          0
Description    0
dtype: int64

In [7]:
#Checking for duplicates initially
df.duplicated(subset=['Description']).sum()

1277

In [8]:
#Dropping empty rows
df.dropna(subset=['Description'], inplace=True)

#Dropping dublicates
df.drop_duplicates(subset=['Description'], inplace=True)

In [9]:
#Checking how many null values are present after dropping
df.isnull().sum()

Class Index    0
Title          0
Description    0
dtype: int64

In [10]:
#Checking for duplicates after dropping
df.duplicated(subset=['Description']).sum()

0

## 2.b) Tokenization and Lemmatization

In [11]:
#Importing Tokenizer and Lemmatizer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

#Tokenizing and Lemmatizing Text
def tokenize_and_lemmatize(text):
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return lemmatized_tokens

df['Description'] = df['Description'].apply(tokenize_and_lemmatize)

## 2.c) Data Cleansing

In [12]:
# Check the data type of the 'Description' column
print(df['Description'].dtype)

# Convert the 'Description' column to string if needed
df['Description'] = df['Description'].astype(str)

object


In [13]:
#Importing re library
import re

#Function to clean text
def clean_text(text):

    # Step 1: Remove URLs and HTTP tags
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'www\S+', '', text)    # Remove URLs
    text = re.sub(r'<.*?>', '', text)     # Remove HTML tags

    # Step 2: Remove emojis
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)  # Remove emojis

    # Step 3: Remove symbols and special characters
    text = re.sub(r'[^\w\s]', '', text)  # Remove non-alphanumeric characters

    # Step 4: Remove excess whitespaces
    text = ' '.join(text.split())  # Remove extra whitespaces

    return text

# Apply the clean_text function to the 'text' column
df['Description'] = df['Description'].apply(clean_text)

In [14]:
# Step 5: Remove stopwords

#Import stopwords from nltk
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
df['Description'] = df['Description'].apply(lambda x: ' '.join([word for word in word_tokenize(x) if word.lower() not in stop_words]))

In [15]:
#Print the cleaned text
print(df['Description'])

0         Reuters Shortsellers Wall Street dwindlingband...
1         Reuters Private investment firm Carlyle Group ...
2         Reuters Soaring crude price plus worriesabout ...
3         Reuters Authorities halted oil exportflows mai...
4         AFP Tearaway world oil price toppling record s...
                                ...                        
119995    KARACHI Reuters Pakistani President Pervez Mus...
119996    Red Sox general manager Theo Epstein acknowled...
119997    Miami Dolphins put courtship LSU coach Nick Sa...
119998    PITTSBURGH NY GIANTS Time 130 pm Line Steelers...
119999    INDIANAPOLIS AllStar Vince Carter wa traded To...
Name: Description, Length: 118723, dtype: object


## 2.d) Lowercase the strings, Replace Abbreviations and Fix Contractions

In [16]:
# Lowercasing the strings
df['Description'] = df['Description'].str.lower()

# Print data
df['Description'].head()

0    reuters shortsellers wall street dwindlingband...
1    reuters private investment firm carlyle group ...
2    reuters soaring crude price plus worriesabout ...
3    reuters authorities halted oil exportflows mai...
4    afp tearaway world oil price toppling record s...
Name: Description, dtype: object

In [17]:
# Define abbreviations
abbreviation_mapping = {
    'u': 'you',
    'r': 'are',
    'y': 'why',
}

In [18]:
# Define contractions
contraction_mapping = {
    "i'm": "i am",
    "you're": "you are",
    "they're":'they are'
}

In [19]:
# Function to replace abbreviations and contractions
def replace_abbreviations_and_contractions(text):
    for abbrev, replacement in abbreviation_mapping.items():
        text = text.replace(abbrev, replacement)
    for contraction, expanded in contraction_mapping.items():
        text = text.replace(contraction, expanded)
    return text

df['Description'] = df['Description'].apply(replace_abbreviations_and_contractions)

# LSTM Model

## 1. Single Layer LSTM with First Set of Results

In [20]:
#Import train_test_split
from sklearn.model_selection import train_test_split

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df['Description'], df['Class Index'], test_size=0.2, random_state=42)

In [22]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Step 1: Tokenize text data
tokenizer = Tokenizer(num_words= 10000)
tokenizer.fit_on_texts(X_train)

X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

# Step 2: Pad sequences to a fixed length
X_train_padded = pad_sequences(X_train_sequences, maxlen= 50, padding='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen= 50, padding='post')


In [23]:
#Importing necessary libraries
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Build and train the first LSTM model
model_1 = Sequential()
model_1.add(Embedding(input_dim= 10000, output_dim= 50, input_length= 50))
model_1.add(LSTM(32))
model_1.add(Dense(1, activation='sigmoid'))
model_1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_1.summary()

model_1.fit(X_train_padded, y_train, batch_size = 4, epochs= 5, validation_data=(X_test_padded, y_test))

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 50)            500000    
                                                                 
 lstm (LSTM)                 (None, 32)                10624     
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 510,657
Trainable params: 510,657
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7e777153bc70>

## 2. Two Layer LSTM with Second Set of Results

In [24]:
# Tokenize text data
tokenizer = Tokenizer(num_words= 10000)
tokenizer.fit_on_texts(X_train)

X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

# Pad sequences to a fixed length
X_train_padded_2 = pad_sequences(X_train_sequences, maxlen= 30, padding='post')
X_test_padded_2 = pad_sequences(X_test_sequences, maxlen= 30, padding='post')


In [25]:
# Build and train the second LSTM model with two layers
model_2 = Sequential()
model_2.add(Embedding(input_dim= 25000, output_dim= 30, input_length= 30))
model_2.add(LSTM(32, return_sequences=True))
model_2.add(LSTM(32))
model_2.add(Dense(1, activation='sigmoid'))
model_2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_2.summary()

model_2.fit(X_train_padded_2, y_train, batch_size= 8, epochs=5 , validation_data=(X_test_padded_2, y_test))

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 30, 30)            750000    
                                                                 
 lstm_1 (LSTM)               (None, 30, 32)            8064      
                                                                 
 lstm_2 (LSTM)               (None, 32)                8320      
                                                                 
 dense_1 (Dense)             (None, 1)                 33        
                                                                 
Total params: 766,417
Trainable params: 766,417
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7e7789d2b310>

In [28]:
#Import classification_report from sklearn.metrics
from sklearn.metrics import classification_report

# Evaluate the first model
y_pred_1 = model_1.predict(X_test_padded)
y_pred_1 = (y_pred_1 > 0.5).astype(int)
report_1 = classification_report(y_test, y_pred_1)
print("Classification Report for Model 1:")
print(report_1)

# Evaluate the second model
y_pred_2 = model_2.predict(X_test_padded_2)
y_pred_2 = (y_pred_2 > 0.5).astype(int)
report_2 = classification_report(y_test, y_pred_2)
print("Classification Report for Model 2:")
print(report_2)




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report for Model 1:
              precision    recall  f1-score   support

           1       0.25      1.00      0.40      5862
           2       0.00      0.00      0.00      5968
           3       0.00      0.00      0.00      5949
           4       0.00      0.00      0.00      5966

    accuracy                           0.25     23745
   macro avg       0.06      0.25      0.10     23745
weighted avg       0.06      0.25      0.10     23745

Classification Report for Model 2:
              precision    recall  f1-score   support

           1       0.25      1.00      0.40      5862
           2       0.00      0.00      0.00      5968
           3       0.00      0.00      0.00      5949
           4       0.00      0.00      0.00      5966

    accuracy                           0.25     23745
   macro avg       0.06      0.25      0.10     23745
weighted avg       0.06      0.25      0.10     23745



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
