In [2]:
import pandas as pd

In [4]:
# Read the labelled review corpus from the working directory and preview it.
df = pd.read_csv(filepath_or_buffer='deceptive-opinion.csv')
df.head(n=3)

Unnamed: 0,deceptive,hotel,polarity,source,text
0,truthful,conrad,positive,TripAdvisor,We stayed for a one night getaway with family ...
1,truthful,hyatt,positive,TripAdvisor,Triple A rate with upgrade to view room was le...
2,truthful,hyatt,positive,TripAdvisor,This comes a little late as I'm finally catchi...


In [9]:
# Keep only the label and the review text for modelling.
# errors='ignore' makes this cell safe to re-run once the columns are already
# gone (a plain drop would raise KeyError on the second execution).
df = df.drop(columns=['hotel', 'source', 'polarity'], errors='ignore')
df.head(2)

Unnamed: 0,deceptive,text
0,truthful,We stayed for a one night getaway with family ...
1,truthful,Triple A rate with upgrade to view room was le...


In [10]:
# Normalise case with the vectorised string accessor. Unlike calling
# x.lower() on each element, this passes missing values through as NaN
# instead of raising AttributeError on a non-string entry.
df['text'] = df['text'].str.lower()
df.head(2)

Unnamed: 0,deceptive,text
0,truthful,we stayed for a one night getaway with family ...
1,truthful,triple a rate with upgrade to view room was le...


In [14]:
import nltk
# Download the NLTK 'stopwords' package; stopwords.words('english') is read from it in a later cell.
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\preet\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
# English stop-word list from NLTK (kept as a list under its original name).
stop = stopwords.words('english')
# Membership tests run once per word of every review; a set makes each test
# O(1) instead of a linear scan of the list. The filtered text is unchanged.
stop_lookup = set(stop)
df['review_without_stopwords'] = df['text'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_lookup)
)

In [17]:
from textblob import TextBlob
# Download the NLTK 'punkt' tokenizer package.
# NOTE(review): presumably required by TextBlob when tagging in the cells below — confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\preet\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [18]:
def pos(review_without_stopwords):
    """Return the ``tags`` of a TextBlob built from the given review text.

    Args:
        review_without_stopwords: Review text (already stripped of stop words
            by the caller) to hand to TextBlob.

    Returns:
        Whatever ``TextBlob(...).tags`` yields for that text; the calling cell
        treats it as an iterable of string tuples.
    """
    blob = TextBlob(review_without_stopwords)
    return blob.tags

In [21]:
nltk.download('averaged_perceptron_tagger')

# Tag every stop-word-free review; each entry is the tag sequence returned by pos().
pos_tags = df['review_without_stopwords'].apply(pos)

# Fuse each tuple into a single token (its parts concatenated with no
# separator) and join the tokens with spaces, giving one string per review.
#
# The column is assigned directly instead of merging a helper frame back on the
# index: that merge duplicated `review_without_stopwords` into `_x`/`_y`
# columns, so the cell failed on a second run. It also avoids naming
# intermediates `os`, which shadows the standard-library module name.
df['pos'] = pos_tags.map(lambda tags: " ".join("".join(pair) for pair in tags))

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\preet\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


In [22]:
from sklearn.model_selection import train_test_split

In [23]:
# Split the POS-token strings and their labels, holding out 20% for testing;
# the fixed random_state keeps the split reproducible across runs.
review_train, review_test, label_train, label_test = train_test_split(
    df['pos'],
    df['deceptive'],
    test_size=0.2,
    random_state=10,
)

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [25]:
# Configure the TF-IDF vectoriser.
tf_vect = TfidfVectorizer(
    lowercase=True,
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=False,
)

# Fit on the training reviews only, then encode the held-out reviews with the
# already-fitted vectoriser.
X_train_tf = tf_vect.fit_transform(review_train)
X_test_tf = tf_vect.transform(review_test)

In [27]:
from sklearn.model_selection import GridSearchCV
from sklearn import svm

In [28]:
def svc_param_selection(X, y, nfolds, kernel='linear'):
    """Grid-search SVC hyper-parameters with cross-validation.

    Args:
        X: Training feature matrix passed to ``GridSearchCV.fit``.
        y: Training labels passed to ``GridSearchCV.fit``.
        nfolds: Value forwarded to ``GridSearchCV`` as ``cv``.
        kernel: SVC kernel to tune. Defaults to ``'linear'``, the kernel this
            function has always used.

    Returns:
        dict with the best ``'C'`` and a ``'gamma'`` entry. For the linear
        kernel ``'gamma'`` is a placeholder (the first candidate), kept so
        callers that index ``best_params['gamma']`` continue to work.
    """
    Cs = [0.001, 0.01, 0.1, 1, 10]
    gammas = [0.001, 0.01, 0.1, 1]

    if kernel == 'linear':
        # gamma is only a coefficient for the rbf/poly/sigmoid kernels; a linear
        # SVC ignores it. Searching over it fits the same model once per gamma
        # value, multiplying the work by len(gammas) for no gain.
        param_grid = {'C': Cs}
    else:
        param_grid = {'C': Cs, 'gamma': gammas}

    grid_search = GridSearchCV(svm.SVC(kernel=kernel), param_grid, cv=nfolds)
    grid_search.fit(X, y)

    best_params = dict(grid_search.best_params_)
    # With identical scores across gamma the full grid always picked the first
    # gamma candidate, so report that same value for the linear kernel.
    best_params.setdefault('gamma', gammas[0])
    return best_params

In [29]:
# Tune the SVM on the TF-IDF training matrix using 5-fold cross-validation.
best_params = svc_param_selection(X=X_train_tf, y=label_train, nfolds=5)
print("Best Parameters:", best_params)

Best Parameters: {'C': 1, 'gamma': 0.001}


In [30]:
# Train the final linear SVM on the whole training matrix with the tuned
# settings; fit() returns the estimator itself, so it can be chained.
clf = svm.SVC(
    kernel='linear',
    C=best_params['C'],
    gamma=best_params['gamma'],
).fit(X_train_tf, label_train)
clf

In [34]:
import pickle

In [36]:
# Serialise the trained classifier to disk so it can be reloaded later.
with open('svm_model.pkl', mode='wb') as model_file:
    pickle.dump(clf, model_file)

print("Model saved successfully!")

Model saved successfully!


In [37]:
# Serialise the fitted TF-IDF vectoriser to its own pickle file.
with open('tfidf_vectorizer.pkl', mode='wb') as vectorizer_file:
    pickle.dump(tf_vect, vectorizer_file)

print("TF-IDF vectorizer saved successfully.")

TF-IDF vectorizer saved successfully.
