In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report,accuracy_score

In [2]:
# Load the training set. Every line of train_data.txt is split on the ':::'
# separator into the four named columns (the file itself has no header row).
# A multi-character separator is handled by the python parsing engine.
# NOTE(review): the fields look like they keep the padding around ':::'
# (a predicted label later prints as ' drama ') -- confirm whether the values
# should be stripped, and if so do it for the solution file as well.
df = pd.read_csv(
    'train_data.txt',
    sep=':::',
    names=['id', 'title', 'genre', 'des'],
    engine='python',
    encoding='UTF-8',
)

In [3]:
# Preview the first five rows to sanity-check the parsed columns.
df.head(n=5)

Unnamed: 0,id,title,genre,des
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...


In [4]:
# Lower-case the descriptions (written back into df), then pull out the
# feature text and the genre labels.
df['des'] = df['des'].str.lower()
x, y = df['des'], df['genre']

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=40, test_size=0.2
)

# Learn the TF-IDF vocabulary on the training split only and reuse it for the
# validation split, so no validation text influences the fitted vocabulary.
tfi = TfidfVectorizer(stop_words='english')
x_train, x_test = tfi.fit_transform(x_train), tfi.transform(x_test)

# Linear SVM on the sparse TF-IDF features.
clr = LinearSVC(dual=False)
clr.fit(x_train, y_train)

In [5]:
# Evaluate the fitted classifier on the validation split.
y_pred = clr.predict(x_test)
print("accuracy :", accuracy_score(y_test, y_pred))
# zero_division=0 reports 0.0 for any undefined precision/recall/F1 instead of
# emitting an UndefinedMetricWarning for classes that receive no predictions;
# this matches the other classification_report calls in this notebook.
print(classification_report(y_test, y_pred, zero_division=0))

# Spot-check a single description: apply the same lower-casing and the already
# fitted TF-IDF vectorizer, then print the predicted genre label.
# NOTE(review): this text appears to be the description of the first training
# row, so it is not an unseen example.
l='Listening in to a conversation between his doctor and parents, 10-year-old Oscar learns what nobody has the courage to tell him. He only has a few weeks to live. Furious, he refuses to speak to anyone except straight-talking Rose, the lady in pink he meets on the hospital stairs. As Christmas approaches, Rose uses her fantastical experiences as a professional wrestler, her imagination, wit and charm to allow Oscar to live life and love to the full, in the company of his friends Pop Corn, Einstein, Bacon and childhood sweetheart Peggy Blue.'
l = l.lower()
l = tfi.transform([l])  # transform expects an iterable of documents
l_pred = clr.predict(l)
print(l_pred[0])

accuracy : 0.5820344922991791
               precision    recall  f1-score   support

      action        0.47      0.34      0.40       247
       adult        0.80      0.43      0.56       137
   adventure        0.41      0.21      0.28       143
   animation        0.37      0.14      0.21       104
   biography        0.00      0.00      0.00        49
      comedy        0.53      0.57      0.55      1496
       crime        0.32      0.06      0.09       108
 documentary        0.69      0.83      0.75      2624
       drama        0.55      0.72      0.63      2744
      family        0.42      0.14      0.21       181
     fantasy        0.28      0.08      0.12        64
   game-show        0.78      0.74      0.76        34
     history        0.50      0.04      0.07        51
      horror        0.60      0.63      0.62       409
       music        0.64      0.54      0.59       154
     musical        0.56      0.08      0.15        59
     mystery        0.30      0.06

In [6]:
# Score the currently fitted `tfi` / `clr` pair on the separate test file.
# NOTE(review): the training file is read as UTF-8 but both test files are
# read as latin-1 -- confirm which encoding the files really use.
df = pd.read_csv(
    "test_data.txt",
    sep=':::',
    names=['id', 'title', 'des'],
    engine='python',
    encoding='latin-1',
)
# Same preprocessing as training: lower-case, then the fitted TF-IDF transform.
# Despite the y_ prefix, y_tes holds the feature matrix.
y_tes = tfi.transform(df['des'].str.lower())
y_pre = clr.predict(y_tes)

# Ground-truth genres for the test rows.
# NOTE(review): assumes test_data_solution.txt lists rows in the same order as
# test_data.txt -- TODO confirm.
dff = pd.read_csv(
    "test_data_solution.txt",
    sep=':::',
    names=['id', 'title', 'genre', 'des'],
    engine='python',
    encoding='latin-1',
)
y_tee = dff['genre']
print(accuracy_score(y_tee, y_pre))

0.5805535055350554


In [7]:
## Method 2: TF-IDF features + logistic regression

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [9]:
# Reload the training file so this section does not depend on earlier cells
# having left `df` untouched.
df = pd.read_csv(
    'train_data.txt',
    sep=':::',
    names=['id', 'title', 'genre', 'des'],
    engine='python',
    encoding='UTF-8',
)

# Lower-cased description text is the feature, genre is the label.
df['des'] = df['des'].str.lower()
x, y = df['des'], df['genre']

# 80/20 train/validation split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=40, test_size=0.2
)

# Fit the TF-IDF vocabulary on the training split; only transform the
# validation split.
tfi = TfidfVectorizer(stop_words='english')
x_train = tfi.fit_transform(x_train)
x_test = tfi.transform(x_test)

In [10]:
# Logistic regression on the TF-IDF features; max_iter is set to 2000
# iterations for the solver.
clr = LogisticRegression(max_iter=2000)
clr.fit(X=x_train, y=y_train)

In [11]:
# Validation-split metrics for the fitted model.
y_pred = clr.predict(x_test)
print("accuracy :", accuracy_score(y_true=y_test, y_pred=y_pred))
# zero_division=0 prints 0.0 for undefined per-class metrics.
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        zero_division=0,
    )
)

accuracy : 0.5781610255464354
               precision    recall  f1-score   support

      action        0.51      0.21      0.29       247
       adult        0.82      0.20      0.33       137
   adventure        0.56      0.13      0.21       143
   animation        0.67      0.04      0.07       104
   biography        0.00      0.00      0.00        49
      comedy        0.53      0.56      0.54      1496
       crime        0.00      0.00      0.00       108
 documentary        0.65      0.87      0.74      2624
       drama        0.53      0.80      0.63      2744
      family        0.53      0.05      0.09       181
     fantasy        0.00      0.00      0.00        64
   game-show        0.89      0.47      0.62        34
     history        0.00      0.00      0.00        51
      horror        0.68      0.57      0.62       409
       music        0.70      0.34      0.46       154
     musical        0.00      0.00      0.00        59
     mystery        0.00      0.00

In [12]:
# Score the currently fitted `tfi` / `clr` pair on the separate test file.
# NOTE(review): the training file is read as UTF-8 but both test files are
# read as latin-1 -- confirm which encoding the files really use.
df = pd.read_csv(
    "test_data.txt",
    sep=':::',
    names=['id', 'title', 'des'],
    engine='python',
    encoding='latin-1',
)
# Lower-case the descriptions and apply the fitted TF-IDF transform.
# Despite the y_ prefix, y_tes holds the feature matrix.
y_tes = tfi.transform(df['des'].str.lower())
y_pre = clr.predict(y_tes)

# Ground-truth genres for the test rows.
# NOTE(review): assumes test_data_solution.txt lists rows in the same order as
# test_data.txt -- TODO confirm.
dff = pd.read_csv(
    "test_data_solution.txt",
    sep=':::',
    names=['id', 'title', 'genre', 'des'],
    engine='python',
    encoding='latin-1',
)
y_tee = dff['genre']
print(accuracy_score(y_tee, y_pre))

0.5799261992619926


In [13]:
## Method 3: TF-IDF features + multinomial naive Bayes

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,classification_report

In [15]:
# Reload the training file and lower-case the description column.
df = pd.read_csv(
    'train_data.txt',
    sep=':::',
    names=['id', 'title', 'genre', 'des'],
    engine='python',
    encoding='UTF-8',
)
df['des'] = df['des'].str.lower()

# Description text is the feature, genre is the label.
x, y = df['des'], df['genre']

# 80/20 train/validation split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=40, test_size=0.2
)

# Fit the TF-IDF vocabulary on the training split; only transform the
# validation split.
tfi = TfidfVectorizer(stop_words='english')
x_train = tfi.fit_transform(x_train)
x_test = tfi.transform(x_test)

# Multinomial naive Bayes with default settings; fit() returns the estimator.
clr = MultinomialNB().fit(x_train, y_train)

# Validation-split metrics; zero_division=0 prints 0.0 for undefined
# per-class metrics.
y_pred = clr.predict(x_test)
print("accuracy :", accuracy_score(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred, zero_division=0))

accuracy : 0.4457253527621507
               precision    recall  f1-score   support

      action        0.00      0.00      0.00       247
       adult        0.00      0.00      0.00       137
   adventure        0.00      0.00      0.00       143
   animation        0.00      0.00      0.00       104
   biography        0.00      0.00      0.00        49
      comedy        0.69      0.04      0.07      1496
       crime        0.00      0.00      0.00       108
 documentary        0.53      0.90      0.67      2624
       drama        0.38      0.88      0.53      2744
      family        0.00      0.00      0.00       181
     fantasy        0.00      0.00      0.00        64
   game-show        0.00      0.00      0.00        34
     history        0.00      0.00      0.00        51
      horror        0.00      0.00      0.00       409
       music        0.00      0.00      0.00       154
     musical        0.00      0.00      0.00        59
     mystery        0.00      0.00

In [16]:
# Spot-check one description with the current model: lower-case it, vectorize
# it with the fitted TF-IDF transform (which takes an iterable of documents),
# and print the single predicted label.
l = tfi.transform(['Listening in to a conversation between his doctor and parents, 10-year-old Oscar learns what nobody has the courage to tell him. He only has a few weeks to live. Furious, he refuses to speak to anyone except straight-talking Rose, the lady in pink he meets on the hospital stairs. As Christmas approaches, Rose uses her fantastical experiences as a professional wrestler, her imagination, wit and charm to allow Oscar to live life and love to the full, in the company of his friends Pop Corn, Einstein, Bacon and childhood sweetheart Peggy Blue.'.lower()])
l_pred = clr.predict(l)
print(l_pred[0])

 drama 


In [17]:
# Score the currently fitted `tfi` / `clr` pair on the separate test file.
# NOTE(review): the training file is read as UTF-8 but both test files are
# read as latin-1 -- confirm which encoding the files really use.
df = pd.read_csv(
    "test_data.txt",
    sep=':::',
    names=['id', 'title', 'des'],
    engine='python',
    encoding='latin-1',
)
# Lower-case the descriptions and apply the fitted TF-IDF transform.
# Despite the y_ prefix, y_tes holds the feature matrix.
y_tes = tfi.transform(df['des'].str.lower())
y_pre = clr.predict(y_tes)

# Ground-truth genres for the test rows; y_tee and y_pre are reused by the
# classification report in the following cell.
# NOTE(review): assumes test_data_solution.txt lists rows in the same order as
# test_data.txt -- TODO confirm.
dff = pd.read_csv(
    "test_data_solution.txt",
    sep=':::',
    names=['id', 'title', 'genre', 'des'],
    engine='python',
    encoding='latin-1',
)
y_tee = dff['genre']
print(accuracy_score(y_tee, y_pre))

0.4433394833948339


In [18]:
# Per-genre precision/recall/F1 on the test file, using the labels and
# predictions computed in the previous cell.
print(
    classification_report(
        y_true=y_tee,
        y_pred=y_pre,
        zero_division=0,
    )
)

               precision    recall  f1-score   support

      action        0.00      0.00      0.00      1314
       adult        0.00      0.00      0.00       590
   adventure        0.00      0.00      0.00       775
   animation        0.00      0.00      0.00       498
   biography        0.00      0.00      0.00       264
      comedy        0.76      0.04      0.08      7446
       crime        0.00      0.00      0.00       505
 documentary        0.52      0.90      0.66     13096
       drama        0.38      0.87      0.53     13612
      family        0.00      0.00      0.00       783
     fantasy        0.00      0.00      0.00       322
   game-show        0.00      0.00      0.00       193
     history        0.00      0.00      0.00       243
      horror        0.00      0.00      0.00      2204
       music        0.00      0.00      0.00       731
     musical        0.00      0.00      0.00       276
     mystery        0.00      0.00      0.00       318
        n