# import some libraries

In [8]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer as vec
from sklearn.svm import LinearSVC
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import cross_val_score
import re
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from lazypredict.Supervised import LazyClassifier
from sklearn.datasets import load_breast_cancer

wnl = WordNetLemmatizer()
 
print("rocks :", wnl.lemmatize("rocks",pos ='n'))
print("corpora :", wnl.lemmatize("corpora", pos = 'n'))


rocks : rock
corpora : corpus


# Take datasets from CSV files

In [19]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [16]:
df_train

Unnamed: 0,reviews_content,category
0,airplane ! is considered among many to be the ...,positive
1,you've got to love disney . \nno matter what t...,positive
2,""" the tailor of panama "" is a different kind ...",positive
3,"the characters in jonathan lynn's "" the whole ...",negative
4,"vikings v . bears ? \nno , this isn't the line...",negative
...,...,...
1495,"trekkies , roger nygard's energetic and hilari...",positive
1496,""" dangerous beauty "" is a really nothing more...",positive
1497,starring shawnee smith ; donovan leitch ; rick...,negative
1498,"man , this was one wierd movie . \nsimilar to ...",negative


# define function for reprocessing

In [5]:
def extract_alphabetic(text):
    # Use regular expression to remove non-alphabetic characters
    return re.sub(r'[^a-zA-Z\s]', '', text)
def wordnet_lemmatizer(text):
    words = text.split()
    output = [wnl.lemmatize(element) for element in words]
    return ' '.join(output)

In [20]:
df_train['processed text'] = df_train['reviews_content'].apply(extract_alphabetic)
df_train['processed text'] = df_train['processed text'].apply(wordnet_lemmatizer)

df_test['processed text'] = df_test['reviews_content'].apply(extract_alphabetic)
df_test['processed text'] = df_test['processed text'].apply(wordnet_lemmatizer)

# Perform TFIDF

In [24]:
vect = vec(ngram_range = (1,3),min_df = 8,max_df = 1000, norm= 'l2')
vect.fit(df_train['processed text'])
X_train = vect.transform(df_train['processed text'])
X_train = X_train.toarray()
df_train['category'] = df_train['category'].replace({'positive':1,'negative':0})
y_train = df_train['category']

X_test = vect.transform(df_test['processed text'])

In [22]:
df_train

Unnamed: 0,reviews_content,category,processed text
0,airplane ! is considered among many to be the ...,1,airplane is considered among many to be the ep...
1,you've got to love disney . \nno matter what t...,1,youve got to love disney no matter what they s...
2,""" the tailor of panama "" is a different kind ...",1,the tailor of panama is a different kind of sp...
3,"the characters in jonathan lynn's "" the whole ...",0,the character in jonathan lynns the whole nine...
4,"vikings v . bears ? \nno , this isn't the line...",0,viking v bear no this isnt the lineup for mond...
...,...,...,...
1495,"trekkies , roger nygard's energetic and hilari...",1,trekkies roger nygards energetic and hilarious...
1496,""" dangerous beauty "" is a really nothing more...",1,dangerous beauty is a really nothing more than...
1497,starring shawnee smith ; donovan leitch ; rick...,0,starring shawnee smith donovan leitch ricky pa...
1498,"man , this was one wierd movie . \nsimilar to ...",0,man this wa one wierd movie similar to conspir...


In [25]:


X_train, X_test, y_train, y_test = train_test_split(X_train, y_train,test_size=.5,random_state =123)

clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train, X_test, y_train, y_test)

print(models)


 97%|███████████████████████████████████████████████████████████████████████████████▏  | 28/29 [09:49<00:29, 29.26s/it]

[LightGBM] [Info] Number of positive: 369, number of negative: 381
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.082672 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 87223
[LightGBM] [Info] Number of data points in the train set: 750, number of used features: 3904
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.492000 -> initscore=-0.032003
[LightGBM] [Info] Start training from score -0.032003


100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [10:03<00:00, 20.83s/it]

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
LogisticRegression                 0.85               0.85     0.85      0.85   
PassiveAggressiveClassifier        0.85               0.85     0.85      0.85   
ExtraTreesClassifier               0.84               0.84     0.84      0.84   
LinearSVC                          0.84               0.84     0.84      0.84   
RidgeClassifierCV                  0.84               0.84     0.84      0.84   
RidgeClassifier                    0.84               0.84     0.84      0.84   
CalibratedClassifierCV             0.84               0.84     0.84      0.84   
Perceptron                         0.83               0.83     0.83      0.83   
NearestCentroid                    0.83               0.83     0.83      0.83   
NuSVC                              0.81               0.82     0.82      0.81   
SGDClassifier               




# fine-tuning regularization constant C

In [31]:
for i in np.arange(0.1,2,0.1):
    model = LinearSVC(tol = 0.001, C = i,dual ='auto')
    result = cross_val_score(model, X_train,y_train,cv = 15)
    print(f'C = {i:.2f} avg - {np.mean(result):.2f} median - {np.median(result):.2f}')

C = 0.10 avg - 0.84 median - 0.84
C = 0.20 avg - 0.85 median - 0.86
C = 0.30 avg - 0.86 median - 0.85
C = 0.40 avg - 0.86 median - 0.85
C = 0.50 avg - 0.86 median - 0.86
C = 0.60 avg - 0.86 median - 0.86
C = 0.70 avg - 0.86 median - 0.86
C = 0.80 avg - 0.86 median - 0.86
C = 0.90 avg - 0.87 median - 0.86
C = 1.00 avg - 0.87 median - 0.86
C = 1.10 avg - 0.87 median - 0.86
C = 1.20 avg - 0.87 median - 0.86
C = 1.30 avg - 0.87 median - 0.86
C = 1.40 avg - 0.87 median - 0.86
C = 1.50 avg - 0.87 median - 0.86
C = 1.60 avg - 0.87 median - 0.86
C = 1.70 avg - 0.87 median - 0.86
C = 1.80 avg - 0.87 median - 0.86
C = 1.90 avg - 0.87 median - 0.86


now we choose $C = 0.9$

# Train the model

In [8]:
model = LinearSVC(tol = 0.001, C = 0.9, dual = 'auto')
model.fit(X_train,y_train)
y_predict = model.predict(X_test)

# Make the CSV files to upload

In [34]:
output_file = pd.DataFrame({'Row': range(1,501), 'Label':y_predict})
output_file.to_csv('SVM_.csv',index = False)