# Classification using Support Vector Machines (SVM)

In [None]:
import pandas as pd
import nltk
import collections
import sklearn
from datetime import datetime
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [None]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation/future warnings from pandas and
# scikit-learn, which are often the first hint that an API call has changed.
warnings.simplefilter('ignore')

In [None]:
# Load the raw titles table; the file is semicolon-delimited rather than
# comma-delimited.
df = pd.read_csv(filepath_or_buffer="netflix_titles.csv", sep=';')

In [None]:
# Preview the first five rows of the raw table.
df.head(n=5)

#### Select columns

In [None]:
# Keep only the free-text description column as a one-column DataFrame.
# .copy() makes `data` an independent frame, so the new columns assigned to it
# in the cleaning steps are written to `data` itself instead of to a slice of
# `df` (which pandas flags with SettingWithCopyWarning).
data = df[['description']].copy()
data.head()

### Cleaning data

##### Clean the punctuation

In [None]:
# Remove every character that is not a word character, whitespace or a hyphen.
# - The pattern is a raw string so `\w` / `\s` are not parsed as (invalid)
#   Python string escapes.
# - regex=True is passed explicitly: pandas >= 2.0 defaults to regex=False,
#   which would search for the pattern as literal text and remove nothing.
data['description_content'] = data['description'].str.replace(r'[^\w\s-]', '', regex=True)
data.head(2)

##### Convert data to lowercase

In [None]:
# Lowercase the cleaned text so that tokens differing only by case are treated
# as the same term. The vectorised .str accessor propagates missing values
# instead of raising AttributeError on them the way a per-row x.lower() would.
data['description_content'] = data['description_content'].str.lower()
data.head(2)

##### Tokenize data

In [None]:
# Split each cleaned description into a list of word tokens.
# NOTE(review): nltk.word_tokenize needs the NLTK tokenizer data to be
# installed already; this notebook never downloads it - confirm the setup.
data['description_token'] = data['description_content'].apply(nltk.word_tokenize)
data.head(2)

##### Remove Stopwords

In [None]:
# Collect NLTK's English stopword list into a set for fast membership tests.
stop_words = set()
stop_words.update(stopwords.words('english'))

In [None]:
# Display the stopword set as the cell output for a quick visual check.
stop_words

In [None]:
def _without_stopwords(tokens):
    """Join the tokens that are not in `stop_words` into one space-separated string."""
    return ' '.join(token for token in tokens if token not in stop_words)


# Rebuild each description as plain text with the stopwords dropped.
data['description_end'] = data['description_token'].apply(_without_stopwords)
data.head(2)

##### Load the data labels

In [None]:
# Read the class labels that accompany the descriptions.
# NOTE(review): with the default header handling the first line of the file is
# consumed as the column name - confirm the file really starts with a header
# row, otherwise the first label is lost.
label = pd.read_csv(filepath_or_buffer='label_netflix.txt')
label.head(n=5)

##### Encoder

In [None]:
# Encode the string labels as integer class ids.
# LabelEncoder expects a 1-D target, so the single-column DataFrame is squeezed
# to a Series first instead of relying on scikit-learn's implicit
# column-vector flattening (which emits a DataConversionWarning). A frame with
# more than one column is left unchanged by squeeze and is still rejected by
# the encoder.
fit_label = LabelEncoder().fit_transform(label.squeeze('columns'))

In [None]:
# Show the first nine encoded labels.
fit_label[0:9]

In [None]:
# Show the first nine original labels for comparison with their encoded ids.
label.head(9)

### Feature Extraction

In [None]:
# Build the TF-IDF vectoriser over single words only (no bigrams or longer).
UNIGRAMS_ONLY = (1, 1)
tfidf = TfidfVectorizer(ngram_range=UNIGRAMS_ONLY)

In [None]:
# Learn the vocabulary and IDF weights from the cleaned descriptions.
# NOTE(review): this fits on every row, including the rows that are later held
# out as the test set by train_test_split; consider fitting on the training
# portion only to avoid leaking test information into the features.
tfidf.fit(raw_documents=data['description_end'])

In [None]:
# Turn each cleaned description into its TF-IDF feature row.
tfidf_after = tfidf.transform(raw_documents=data['description_end'])

In [None]:
# Names of the TF-IDF features, in column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. The older method is kept as a
# fallback so the cell still runs on scikit-learn < 1.0.
if hasattr(tfidf, 'get_feature_names_out'):
    # The new method returns an ndarray; convert to a list to match the type
    # the old method produced.
    name_columns = list(tfidf.get_feature_names_out())
else:
    name_columns = tfidf.get_feature_names()
name_columns

In [None]:
# Print the fitted vocabulary: a mapping from each term to its feature
# (column) index in the TF-IDF matrix.
print(tfidf.vocabulary_)

In [None]:
# Convert the transform output to a dense NumPy array and display it.
# NOTE(review): the dense array stores every (document, term) cell explicitly,
# so memory grows with rows x vocabulary size - confirm this fits for the full
# dataset.
tfidf_array = tfidf_after.toarray()
tfidf_array

In [None]:
# Wrap the dense TF-IDF values in a DataFrame whose columns are the terms.
tfidf_matrix = pd.DataFrame(data=tfidf_array, columns=name_columns)

In [None]:
# Display the labelled TF-IDF table as the cell output.
tfidf_matrix

### Split into train and test data

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    tfidf_array,
    fit_label,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Report the shape of every piece of the split, one line per piece.
for caption, split_part in (
    ('train data : ', x_train),
    ('train label : ', y_train),
    ('test data : ', x_test),
    ('test label : ', y_test),
):
    print(caption, split_part.shape)

### Classification using SVM

In [None]:
# Train a linear-kernel SVM on the training split and report how long the fit
# took (wall-clock time).
start_time = datetime.now()
svc = SVC(kernel='linear').fit(x_train, y_train)  # fit() returns the estimator itself
end_time = datetime.now()

elapsed = end_time - start_time
print('Duration: {}'.format(elapsed))

In [None]:
# Predict a class id for every row of the held-out test features.
predictions_svm = svc.predict(X=x_test)

In [None]:
# Percentage of test rows whose predicted class matches the true class.
# accuracy_score is documented as (y_true, y_pred); the arguments are passed in
# that order so the call stays correct if it is later swapped for an
# order-sensitive metric (precision, recall, confusion matrix).
print("SVM Accuracy Score : ",accuracy_score(y_test, predictions_svm)*100)

For more information visit https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html