# AG's News Topic Classification

**Importing the required libraries**

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,classification_report

**Creating Objects**

In [None]:
# Shared helper objects used by the cells below.
lem = WordNetLemmatizer()       # lemmatizer applied to every token during text cleaning
tv = TfidfVectorizer()          # turns the cleaned corpus into TF-IDF features
rf = RandomForestClassifier()   # first candidate model (library default settings)
nb = MultinomialNB()            # second candidate model (library default settings)

**Loading dataset**

In [None]:
# Column names are passed explicitly via `names=`, so the first line of each
# CSV file is read as data rather than as a header row.
dftrain = pd.read_csv(
    'C:/Users/Lenovo/Desktop/ag_news_csv/train.csv',
    names=['class', 'Title', 'Description'],
)
dftest = pd.read_csv(
    'C:/Users/Lenovo/Desktop/ag_news_csv/test.csv',
    names=['class', 'Title', 'Description'],
)

**Dimension of dataset**

In [None]:
# (rows, columns) of the training DataFrame.
dftrain.shape

In [None]:
# (rows, columns) of the test DataFrame.
dftest.shape

**Checking for missing values in the dataset**

In [None]:
# isnull().sum() gives one null count per column for each split.
train_nulls = dftrain.isnull().sum()
test_nulls = dftest.isnull().sum()

print(f"Total missing values in training set:\n {train_nulls}\n")
print(f"Total missing values in testing set:\n {test_nulls}\n")

---------------- No missing values present ----------------

**Checking for class imbalance**

In [None]:
# Number of training rows per value of the 'class' column.
dftrain['class'].value_counts()

In [None]:
# Number of test rows per value of the 'class' column.
dftest['class'].value_counts()

---------------- There is no class imbalance in dataset ----------------

**Separating independent (X) and dependent (Y) data**

In [None]:
# Features: columns at positions 1 and 2, taken as a 2-D array with one row per article.
Xtrain, Xtest = (frame.iloc[:, 1:3].values for frame in (dftrain, dftest))

# Target: the column at position 0, taken as a 1-D array.
Ytrain, Ytest = (frame.iloc[:, 0].values for frame in (dftrain, dftest))

**Text cleaning**

In [None]:
# Build the cleaned training corpus: one normalised string per row of Xtrain.
#
# The stop-word set and the compiled regex do not change between rows, so they
# are created once here. Previously the stop-word set was rebuilt for every
# single token inside the comprehension, which made this cell needlessly slow.
stop_words = set(stopwords.words('english'))
non_alpha = re.compile('[^a-zA-Z]')

corpus1 = []
for row in Xtrain:
    # NOTE(review): str(row) stringifies the whole feature row at once;
    # kept as-is so the resulting tokens are unchanged.
    text = non_alpha.sub(' ', str(row)).lower()   # letters only, then lower-case
    tokens = nltk.word_tokenize(text)             # split the text into words
    # Drop stop words and lemmatize what remains.
    kept = [lem.lemmatize(token) for token in tokens if token not in stop_words]
    corpus1.append(' '.join(kept))

In [None]:
# Build the cleaned test corpus with exactly the same steps as the training
# corpus: keep letters only, lower-case, tokenize, drop stop words, lemmatize.
#
# The stop-word set and the compiled regex do not change between rows, so they
# are created once here. Previously the stop-word set was rebuilt for every
# single token inside the comprehension, which made this cell needlessly slow.
stop_words = set(stopwords.words('english'))
non_alpha = re.compile('[^a-zA-Z]')

corpus2 = []
for row in Xtest:
    # NOTE(review): str(row) stringifies the whole feature row at once;
    # kept as-is so the resulting tokens are unchanged.
    text = non_alpha.sub(' ', str(row)).lower()   # letters only, then lower-case
    tokens = nltk.word_tokenize(text)             # split the text into words
    # Drop stop words and lemmatize what remains.
    kept = [lem.lemmatize(token) for token in tokens if token not in stop_words]
    corpus2.append(' '.join(kept))

**Feature extraction using TF-IDF**

In [None]:
# Learn the vocabulary and IDF weights from the training corpus only, then
# apply that same fitted vectorizer to the test corpus.
#
# The results are kept as the sparse matrices the vectorizer returns. Calling
# .toarray() would allocate a dense documents x vocabulary array, which can
# exhaust memory on a large corpus. Both RandomForestClassifier and
# MultinomialNB accept sparse input for fit() and predict().
Xtrain = tv.fit_transform(corpus1)
Xtest = tv.transform(corpus2)

**Fitting the Random Forest and Multinomial Naive Bayes models on the training data**

In [None]:
# Train both classifiers on the same TF-IDF features and labels.
rf.fit(Xtrain, Ytrain)  # Random Forest
nb.fit(Xtrain, Ytrain)  # Multinomial Naive Bayes

**Predicting classes for the Xtest data using the Random Forest and Multinomial Naive Bayes models**

In [None]:
# Predicted class labels for the test features, one array per model.
rfpredict = rf.predict(Xtest)  # Random Forest
nbpredict = nb.predict(Xtest)  # Multinomial Naive Bayes

**Model Evaluation**

In [None]:
# Print the per-class report produced by classification_report for each model.
# The labels previously said "Confusion Matrix", but the value printed is a
# classification report, so the labels now name it correctly.
print("Classification report of Random forest:\n {}".format(classification_report(Ytest, rfpredict)))
print("Classification report of Multinominal naive bayes:\n {}".format(classification_report(Ytest, nbpredict)))

In [None]:
# Accuracy of each model on the test set, as returned by accuracy_score.
rf_accuracy = accuracy_score(Ytest, rfpredict)
nb_accuracy = accuracy_score(Ytest, nbpredict)

# Report the more accurate model first, then the other one.
# When the two scores are equal, the Naive Bayes branch is taken.
if rf_accuracy > nb_accuracy:
    report_lines = (
        f"Random Forest is more accurate with accuracy of {rf_accuracy*100}%",
        f"Accuracy of Multinominal naive bayes is {nb_accuracy*100}%",
    )
else:
    report_lines = (
        f"Multinominal naive bayes is more accurate with accuracy of {nb_accuracy*100}%",
        f"Accuracy of Random forest is {rf_accuracy*100}%",
    )

for report_line in report_lines:
    print(report_line)