In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer as CV
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer,PorterStemmer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
!unzip archive.zip

In [None]:
# Load the labelled training split. Column names are supplied explicitly, so
# the first line of the file is read as data; the multi-character ':::'
# separator is handled by the python parsing engine.
tp = "Genre Classification Dataset/train_data.txt"
train = pd.read_csv(
    tp,
    sep=':::',
    engine='python',
    names=['ID', 'TITLE', 'GENRE', 'DESCRIPTION'],
)

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Column dtypes and non-null counts of the training data.
train.info()

In [None]:
# Summary statistics for the training columns.
train.describe()

In [None]:
# Number of missing values in each training column.
train.isnull().sum()

In [None]:
# Load the unlabelled test split (no GENRE column). As with the training file,
# the names are supplied explicitly and ':::' is parsed by the python engine.
tep = "Genre Classification Dataset/test_data.txt"
test = pd.read_csv(
    tep,
    sep=":::",
    engine='python',
    names=['id', 'TITLE', 'DESCRIPTION'],
)

In [None]:
# Column dtypes and non-null counts of the test data.
test.info()

In [None]:
# Summary statistics for the test columns.
test.describe()

In [None]:
# Number of missing values in each test column.
test.isnull().sum()

In [None]:
# Bar chart of how many training rows each genre has, most frequent genre first.
plt.figure(figsize=(30, 10))
sns.countplot(
    x='GENRE',
    data=train,
    order=train['GENRE'].value_counts().index,
)
plt.show()

In [None]:
# Preview the training rows again ahead of the text-cleaning cells.
train.head()

In [None]:
# Fetch NLTK's stop-word corpus, then build the English stop-word set that the
# text cleaner filters against.
nltk.download('stopwords')
stop_words = set(stopwords.words("english"))

# Lancaster stemmer instance.
# NOTE(review): not referenced by any of the cells visible here.
stemmer = LancasterStemmer()

In [None]:
# Tokenizer data for nltk.word_tokenize. Recent NLTK releases load the
# 'punkt_tab' resource instead of the older pickled 'punkt' models, so both
# ids are requested; whichever one the installed release uses is then present.
nltk.download('punkt')
nltk.download('punkt_tab')

stop_words = set(stopwords.words("english"))  # Stopwords set

def clean_text(text):
    """Normalise a free-text description into space-separated lowercase words.

    Steps, in order:
      1. lowercase the text;
      2. delete @mentions, anything starting with ``http`` and
         ``pic.<something>`` link fragments;
      3. delete ``#`` so a hashtag keeps its word;
      4. turn every run of non-letter characters into a single space;
      5. tokenize with ``nltk.word_tokenize`` and drop stop words (module-level
         ``stop_words`` set) and tokens of two characters or fewer.

    Args:
        text: The raw description string.

    Returns:
        The cleaned text: the surviving tokens joined by single spaces
        (an empty string when nothing survives).
    """
    text = text.lower()
    text = re.sub(r'@\S+', '', text)      # drop @mentions
    text = re.sub(r'http\S+', '', text)   # drop URLs
    # Only literal "pic.<...>" link fragments are removed. The dot is escaped
    # and follows "pic"; a leading wildcard dot would also swallow ordinary
    # words that merely contain "pic" (typical, picture, suspicion, ...).
    text = re.sub(r'pic\.\S+', '', text)
    text = re.sub(r'#', '', text)         # keep the hashtag word, drop the '#'
    # Every run of non-letters (digits, punctuation, '+', whitespace) becomes
    # one space, so words separated only by a symbol are not glued together.
    text = re.sub(r'[^a-z]+', ' ', text)
    tokens = nltk.word_tokenize(text)
    # Uses the module-level stop_words set rather than rebuilding it per call.
    kept = [tok for tok in tokens if tok not in stop_words and len(tok) > 2]
    return " ".join(kept)

# Store a cleaned version of every description in a new TextCleaning column
# on both frames.
train["TextCleaning"] = train["DESCRIPTION"].map(clean_text)
test["TextCleaning"] = test["DESCRIPTION"].map(clean_text)

In [None]:
# Display the training frame (it carries the TextCleaning column once the cleaning cell has run).
train

In [None]:
st = PorterStemmer()


def _stem_words(text):
    """Return `text` with every whitespace-separated word Porter-stemmed."""
    return " ".join(st.stem(word) for word in text.split())


# NOTE(review): this overwrites TextCleaning in place, so running the cell a
# second time stems text that has already been stemmed.
train['TextCleaning'] = train['TextCleaning'].apply(_stem_words)
test['TextCleaning'] = test['TextCleaning'].apply(_stem_words)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the genre labels with integer codes. `le` is fitted on the labels
# first and then used to transform them.
le = LabelEncoder()
le.fit(train['GENRE'].values)
train['GENRE'] = le.transform(train['GENRE'].values)

In [None]:
# Narrow each frame to the columns used from here on: cleaned text plus the
# encoded genre for training, cleaned text plus title for test.
train_df = train[['TextCleaning', 'GENRE']]
test_df = test[['TextCleaning', 'TITLE']]
train_df.head(10)

In [None]:
# Hold out 20% of the labelled rows for validation; the fixed random_state
# makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train_df['TextCleaning'],
    train['GENRE'],
    test_size=0.2,
    shuffle=True,
    random_state=42,
)
print('Split data into train and eval sets')
print('Traning Set\t: {}\nValidation Set\t: {}'.format(len(X_train), len(X_test)))

In [None]:
# TF-IDF features capped at 1000 terms. The vocabulary is learned from the
# training split only and then reused, unchanged, for the validation split.
vectorize = TfidfVectorizer(max_features=1000, stop_words='english')
X_train_tfidf = vectorize.fit_transform(X_train)
X_test_tfidf = vectorize.transform(X_test)

In [None]:
from sklearn.svm import LinearSVC

# Linear SVM: fit on the training features, then score the validation split.
sv_model = LinearSVC(max_iter=1000)
sv_model.fit(X_train_tfidf, y_train)
predict_sv = sv_model.predict(X_test_tfidf)

print(classification_report(y_test, predict_sv))
sv_accuracy = accuracy_score(y_test, predict_sv)
print(f'Support vector accuracy is: {sv_accuracy * 100:.2f}%')

In [None]:
# Logistic regression: fit on the training features, then score the
# validation split.
lr_model = LogisticRegression()
lr_model.fit(X_train_tfidf, y_train)
predict_lr = lr_model.predict(X_test_tfidf)

print(classification_report(y_test, predict_lr))
lr_accuracy = accuracy_score(y_test, predict_lr)
print(f'Logistic Regression accuracy is: {lr_accuracy * 100:.2f}%')

In [None]:
# Multinomial Naive Bayes: fit on the training features, then score the
# validation split.
nv_model = MultinomialNB()
nv_model.fit(X_train_tfidf, y_train)
predict_nv = nv_model.predict(X_test_tfidf)

print(classification_report(y_test, predict_nv))
nv_accuracy = accuracy_score(y_test, predict_nv)
# The label names this model; it previously read "Logistic Regression",
# copied over from the cell above.
print('Multinomial Naive Bayes accuracy is: {:.2f}%'.format(nv_accuracy * 100))

In [None]:
# Gather the three validation accuracies into one comparison table.
columns = ['LogisticRegression', 'MultinomialNB', 'SVC']
accuracy = [lr_accuracy, nv_accuracy, sv_accuracy]

FinalResult = pd.DataFrame(
    list(zip(columns, accuracy)),
    columns=['Algorithm', 'Accuracy'],
)

FinalResult

In [None]:
# Line plot of validation accuracy for each algorithm.
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(FinalResult.Algorithm, accuracy, label="Accuracy")
ax.legend()
plt.show()