Tanishq
AIML B2
21070126098

In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import gensim
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [None]:
# Mount Google Drive at /content/drive (Colab-only API); the pretrained
# GoogleNews vectors are read from this mount further down the notebook.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Locations of the train / test CSV files, then read both into DataFrames
train_url = "/content/train.csv"
test_url = "/content/test.csv"
train_data, test_data = (pd.read_csv(csv_path) for csv_path in (train_url, test_url))

In [None]:
# Display the last rows of the training frame as a quick sanity check of the load.
train_data.tail()

Unnamed: 0,Class Index,Title,Description
119995,1,Pakistan's Musharraf Says Won't Quit as Army C...,KARACHI (Reuters) - Pakistani President Perve...
119996,2,Renteria signing a top-shelf deal,Red Sox general manager Theo Epstein acknowled...
119997,2,Saban not going to Dolphins yet,The Miami Dolphins will put their courtship of...
119998,2,Today's NFL games,PITTSBURGH at NY GIANTS Time: 1:30 p.m. Line: ...
119999,2,Nets get Carter from Raptors,INDIANAPOLIS -- All-Star Vince Carter was trad...


In [None]:
# Preprocessing
# Fetch only the NLTK resources this notebook relies on (tokenizer models,
# WordNet for lemmatization, stopword lists) rather than the entire 'all'
# collection, which is very large and floods the cell output.
# halt_on_error=False keeps going if one id is unavailable; quiet=True
# suppresses the per-package progress log.
nltk.download(
    ['punkt', 'punkt_tab', 'wordnet', 'omw-1.4', 'stopwords'],
    quiet=True,
    halt_on_error=False,
)
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping grammars/basque_grammars.zip.
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   U

In [None]:
def preprocess_text(text):
    """Clean one raw document and return it as a space-joined token string.

    Steps, in order: remove ``http...`` URLs, tokenize with ``word_tokenize``,
    lemmatize every token with ``WordNetLemmatizer``, keep only alphanumeric
    tokens (lower-cased), and drop English stopwords.

    Args:
        text: Raw document text. Any non-string value (e.g. NaN coming from
            an empty CSV cell) is treated as an empty document.

    Returns:
        str: The surviving tokens joined by single spaces ('' if none remain).
    """
    if not isinstance(text, str):
        return ''

    # Build the stopword set and the lemmatizer once and cache them on the
    # function object. Previously the stopword list was re-read and scanned
    # linearly for every single token, and a lemmatizer was created per call.
    try:
        stop_words = preprocess_text._stop_words
        lemmatizer = preprocess_text._lemmatizer
    except AttributeError:
        stop_words = preprocess_text._stop_words = frozenset(stopwords.words('english'))
        lemmatizer = preprocess_text._lemmatizer = WordNetLemmatizer()

    text = re.sub(r'http\S+', '', text)  # Remove URLs

    cleaned = []
    for token in word_tokenize(text):
        # Lemmatize before lower-casing to keep the original processing order.
        lemma = lemmatizer.lemmatize(token)
        if not lemma.isalnum():  # Remove symbols
            continue
        lemma = lemma.lower()
        if lemma in stop_words:  # Remove stopwords
            continue
        cleaned.append(lemma)
    return ' '.join(cleaned)

In [None]:
# Clean every description element-wise and store the result next to the raw text
train_data['cleaned_description'] = train_data['Description'].map(preprocess_text)
test_data['cleaned_description'] = test_data['Description'].map(preprocess_text)

In [None]:
# Count Vectorization
# The vectorizer is fitted on the cleaned training text only; the test text is
# transformed with that same fitted vectorizer.
count_vectorizer = CountVectorizer()
X_train_count = count_vectorizer.fit_transform(train_data['cleaned_description'])
X_test_count = count_vectorizer.transform(test_data['cleaned_description'])

In [None]:
# TFIDF Vectorization
# Same pattern as the count features: fit on the cleaned training text,
# then reuse the fitted vectorizer to transform the test text.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(train_data['cleaned_description'])
X_test_tfidf = tfidf_vectorizer.transform(test_data['cleaned_description'])

In [None]:
# Word2Vec model trained on the whitespace-tokenized, cleaned training descriptions
train_sentences = [text.split() for text in train_data['cleaned_description']]
word2vec_model = Word2Vec(
    train_sentences,
    vector_size=300,
    window=5,
    min_count=1,
    sg=0,
)

In [None]:
def document_vector_w2v(doc, model=None):
    """Average the Word2Vec vectors of the in-vocabulary tokens of ``doc``.

    Args:
        doc: Document as a whitespace-separated token string.
        model: Object exposing keyed vectors as ``.wv``. Defaults to the
            notebook-level ``word2vec_model``.

    Returns:
        numpy.ndarray: 1-D vector of length ``model.wv.vector_size``. A
        document with no in-vocabulary tokens (empty text, or only unseen
        words) yields an all-zero vector instead of a scalar NaN, so the
        per-document vectors can always be stacked into a matrix.
    """
    keyed_vectors = (word2vec_model if model is None else model).wv
    vectors = [keyed_vectors[token] for token in doc.split() if token in keyed_vectors]
    if not vectors:
        # np.mean([]) would return NaN (a scalar), breaking np.vstack later.
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)

In [None]:
# One averaged Word2Vec vector per cleaned description
train_data['w2v_vector'] = train_data['cleaned_description'].map(document_vector_w2v)
test_data['w2v_vector'] = test_data['cleaned_description'].map(document_vector_w2v)

In [None]:
# Stack the per-document vectors row-wise into 2-D feature matrices
X_train_w2v = np.vstack(train_data['w2v_vector'].tolist())
X_test_w2v = np.vstack(test_data['w2v_vector'].tolist())

In [None]:
# Google News Word2Vec model, loaded in binary word2vec format from the mounted Drive
google_news_w2v_model = gensim.models.KeyedVectors.load_word2vec_format(
    "/content/drive/MyDrive/GoogleNews-vectors-negative300.bin.gz",  # Replace with actual path
    binary=True,
)

In [None]:

def document_vector_google_w2v(doc, model=None):
    """Average the pretrained vectors of the in-vocabulary tokens of ``doc``.

    Args:
        doc: Document as a whitespace-separated token string.
        model: Keyed-vectors object supporting ``token in model``,
            ``model[token]`` and ``model.vector_size``. Defaults to the
            notebook-level ``google_news_w2v_model``.

    Returns:
        numpy.ndarray: 1-D vector of length ``model.vector_size``. A document
        with no in-vocabulary tokens yields an all-zero vector instead of a
        scalar NaN, so the per-document vectors can always be stacked.
    """
    keyed_vectors = google_news_w2v_model if model is None else model
    vectors = [keyed_vectors[token] for token in doc.split() if token in keyed_vectors]
    if not vectors:
        # np.mean([]) would return NaN (a scalar), breaking np.vstack later.
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)

In [None]:
# One averaged GoogleNews vector per cleaned description
train_data['google_w2v_vector'] = train_data['cleaned_description'].map(document_vector_google_w2v)
test_data['google_w2v_vector'] = test_data['cleaned_description'].map(document_vector_google_w2v)

In [None]:
# Stack the per-document GoogleNews vectors row-wise into 2-D feature matrices
X_train_google_w2v = np.vstack(train_data['google_w2v_vector'].tolist())
X_test_google_w2v = np.vstack(test_data['google_w2v_vector'].tolist())

In [None]:
# Target labels: the 'Class Index' column of the train and test frames
y_train, y_test = train_data['Class Index'], test_data['Class Index']

In [None]:
# Models and Evaluation
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training data and score it on the test data.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
            fitted in place.
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        The output of ``classification_report(y_test, predictions)``.
    """
    model.fit(X_train, y_train)
    return classification_report(y_test, model.predict(X_test))

# Results

In [None]:
# Collects one entry per experiment: "<model> with <features>" label ->
# report returned by evaluate_model.
results = {}

# Logistic Regression

In [None]:
# Logistic Regression on the CountVectorizer features
logreg_cv = LogisticRegression(max_iter=1000)
logreg_cv_report = evaluate_model(
    model=logreg_cv,
    X_train=X_train_count,
    X_test=X_test_count,
    y_train=y_train,
    y_test=y_test,
)
results['Logistic Regression with CountVectorizer'] = logreg_cv_report

In [None]:
# Logistic Regression on the TF-IDF features
logreg_tfidf = LogisticRegression(max_iter=1000)
logreg_tfidf_report = evaluate_model(
    model=logreg_tfidf,
    X_train=X_train_tfidf,
    X_test=X_test_tfidf,
    y_train=y_train,
    y_test=y_test,
)
results['Logistic Regression with TF-IDF Vectorizer'] = logreg_tfidf_report

In [None]:
# Logistic Regression on the averaged Word2Vec document vectors
logreg_w2v = LogisticRegression(max_iter=1000)
logreg_w2v_report = evaluate_model(
    model=logreg_w2v,
    X_train=X_train_w2v,
    X_test=X_test_w2v,
    y_train=y_train,
    y_test=y_test,
)
results['Logistic Regression with Word2Vec'] = logreg_w2v_report

In [None]:
# Logistic Regression on the averaged GoogleNews document vectors
logreg_google_w2v = LogisticRegression(max_iter=1000)
logreg_google_w2v_report = evaluate_model(
    model=logreg_google_w2v,
    X_train=X_train_google_w2v,
    X_test=X_test_google_w2v,
    y_train=y_train,
    y_test=y_test,
)
results['Logistic Regression with GoogleNews Word2Vec'] = logreg_google_w2v_report

# SVC

In [None]:
# SVC (constructor defaults) on the CountVectorizer features
svc_cv = SVC()
svc_cv_report = evaluate_model(
    model=svc_cv,
    X_train=X_train_count,
    X_test=X_test_count,
    y_train=y_train,
    y_test=y_test,
)
results['SVC with CountVectorizer'] = svc_cv_report

In [None]:
# SVC (constructor defaults) on the TF-IDF features
svc_tfidf = SVC()
svc_tfidf_report = evaluate_model(
    model=svc_tfidf,
    X_train=X_train_tfidf,
    X_test=X_test_tfidf,
    y_train=y_train,
    y_test=y_test,
)
results['SVC with TF-IDF Vectorizer'] = svc_tfidf_report

In [None]:
# SVC (constructor defaults) on the averaged Word2Vec document vectors
svc_w2v = SVC()
svc_w2v_report = evaluate_model(
    model=svc_w2v,
    X_train=X_train_w2v,
    X_test=X_test_w2v,
    y_train=y_train,
    y_test=y_test,
)
results['SVC with Word2Vec'] = svc_w2v_report

In [None]:
# SVC (constructor defaults) on the averaged GoogleNews document vectors
svc_google_w2v = SVC()
svc_google_w2v_report = evaluate_model(
    model=svc_google_w2v,
    X_train=X_train_google_w2v,
    X_test=X_test_google_w2v,
    y_train=y_train,
    y_test=y_test,
)
results['SVC with GoogleNews Word2Vec'] = svc_google_w2v_report

# Random Forest

In [None]:
# Random Forest with CountVectorizer
# random_state is pinned so the forest (and therefore the report) is identical
# on every re-run; n_jobs=-1 builds the trees on all available cores.
rf_cv = RandomForestClassifier(random_state=42, n_jobs=-1)
rf_cv_report = evaluate_model(rf_cv, X_train_count, X_test_count, y_train, y_test)
results['Random Forest with CountVectorizer'] = rf_cv_report

In [None]:
# Random Forest with TF-IDF Vectorizer
# random_state is pinned so the forest (and therefore the report) is identical
# on every re-run; n_jobs=-1 builds the trees on all available cores.
rf_tfidf = RandomForestClassifier(random_state=42, n_jobs=-1)
rf_tfidf_report = evaluate_model(rf_tfidf, X_train_tfidf, X_test_tfidf, y_train, y_test)
results['Random Forest with TF-IDF Vectorizer'] = rf_tfidf_report

In [None]:
# Random Forest with Word2Vec
# random_state is pinned so the forest (and therefore the report) is identical
# on every re-run; n_jobs=-1 builds the trees on all available cores.
rf_w2v = RandomForestClassifier(random_state=42, n_jobs=-1)
rf_w2v_report = evaluate_model(rf_w2v, X_train_w2v, X_test_w2v, y_train, y_test)
results['Random Forest with Word2Vec'] = rf_w2v_report

In [None]:
# Random Forest with GoogleNews Word2Vec
# random_state is pinned so the forest (and therefore the report) is identical
# on every re-run; n_jobs=-1 builds the trees on all available cores.
rf_google_w2v = RandomForestClassifier(random_state=42, n_jobs=-1)
rf_google_w2v_report = evaluate_model(rf_google_w2v, X_train_google_w2v, X_test_google_w2v, y_train, y_test)
results['Random Forest with GoogleNews Word2Vec'] = rf_google_w2v_report