# movie problem

# load data

In [None]:
# load data libraries
import numpy as np # linear algebra library
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import zipfile # to read zip files
from sklearn.model_selection import train_test_split


# data understanding libraries
import matplotlib.pyplot as plt # ploting library
%matplotlib inline
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from collections import Counter


# data preparation
import re
from nltk.stem import PorterStemmer


# ADS Creation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import StandardScaler

# Modeling
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import MultinomialNB

# Evaluation and Model Selection
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn import metrics
from sklearn.model_selection import learning_curve
from sklearn.model_selection import GridSearchCV


# load data

In [None]:
# Location of the zipped, tab-separated training data.
train_zip_path = '../input/sentiment-analysis-on-movie-reviews/train.tsv.zip'

# Open the archive only to confirm it exists and is a valid zip, then close it
# immediately: nothing reads from this handle, and it used to stay open for
# the rest of the session.
with zipfile.ZipFile(train_zip_path) as archive_train:
    pass

# Read the training file straight from the zip path.
train = pd.read_csv(train_zip_path, sep='\t')

# Output the first 5 rows.
train.head()


# Splitting data

In [None]:
# First hold out 40% of the rows, then cut that hold-out in half to obtain
# the validation and test sets (same seed for both splits).
train_data, holdout_data = train_test_split(train, test_size=0.4, random_state=1)
val_data, test_data = train_test_split(holdout_data, test_size=0.5, random_state=1)

# Give every part a fresh 0..n-1 index.
train_data, val_data, test_data = (
    part.reset_index(drop=True) for part in (train_data, val_data, test_data)
)

### Length of each part

In [None]:
# Report how many rows ended up in each part of the split.
n_train, n_val, n_test = len(train_data), len(val_data), len(test_data)
print("Train set size is ", n_train)
print("Val set size is ", n_val)
print("Test set size is ", n_test)

# 2.Data Understanding

### Data structure

In [None]:
# Summarise the full training frame: column names, dtypes and non-null counts.
train.info()

#### There are no nulls
#### There are 156060 entries

## 2.2 What is the frequency of each sentiment?

In [None]:
# Number of distinct whitespace-separated tokens in the training phrases.
# All phrases are glued into one string and split again on whitespace.
count = " ".join(train_data["Phrase"]).split()
len(set(count))

In [None]:
# Distinct sentiment classes present in the training split, in order of
# first appearance.
sentiment_column = train_data['Sentiment']
labels = sentiment_column.unique()

In [None]:
# Count phrases per sentiment and order the counts by class label, so each
# bar height is paired with its own label. Plotting unique() (order of first
# appearance) against value_counts() (descending frequency) mislabels bars.
sentiment_counts = train_data['Sentiment'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(15,10)) # create the plot and specify the figure size
plt.xlabel('Sentiment') # specify the x labels
plt.ylabel('Frequency') # specify the y labels
plt.title('Frequency of Sentiment') # specify the plot title
plt.bar(sentiment_counts.index, sentiment_counts.values) # one bar per sentiment class
plt.grid() # show the grid
plt.show() # show the final plot

* From the graph above, it seems that about 48,000 phrases have sentiment 2.

# Number of words per phrase

In [None]:
# Add a length column for every phrase.
# NOTE(review): len() on the phrase string counts characters, not
# whitespace-separated words, despite the column name. The thresholds used in
# the later cells (bins up to 200, outliers above 170) are on this scale.
train_data['Words_Num'] = train_data["Phrase"].apply(len)

# Distinct lengths in ascending order. They are plotted against
# value_counts().sort_index(), so they must be sorted the same way for every
# bar to sit at its own length.
numbers = np.sort(train_data['Words_Num'].unique())

In [None]:
# Number of phrases for each distinct length, ordered by length. Positions and
# heights both come from this one series so they cannot get out of step.
length_counts = train_data["Words_Num"].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(30,10))
plt.bar(length_counts.index, length_counts.values)
plt.xlabel('Number of words')
plt.ylabel('Phrase count')
plt.title('Number of Words per Phrase Count')
# Replace the default x ticks with one tick every 5 units over the observed range.
plt.xticks(np.arange(length_counts.index.min(), length_counts.index.max() + 1, 5.0))
plt.grid()
plt.show()

In [None]:
# Histogram of phrase lengths in buckets of width 10 covering 0..190.
bins = range(0,200,10)

fig, ax = plt.subplots(figsize=(15,10))
ax.hist(train_data['Words_Num'], bins=bins, edgecolor="k")
ax.set_xlabel('Number of Words')
ax.set_ylabel('Phrase count')
ax.set_title('Number of words per Phrase Count')
ax.set_xticks(bins)  # put a tick on every bucket edge
ax.grid()
plt.show()

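**Now it is obvious that the typical number of words is between 10 and 60, which covers about 70,000 to 80,000 phrases.** (Note: `Words_Num` is computed with `len` on the raw phrase string, so these figures are character counts.)
##### The outliers lie above 170 and below 2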

#### Check for outliers

In [None]:
# Count the phrases at both extremes of the length distribution.
is_too_short = train_data["Words_Num"] < 2
is_too_long = train_data["Words_Num"] > 170
print("There are " , int(is_too_short.sum()), " Phrase with words less than 2.")
print("There are " , int(is_too_long.sum()), " Phrase with words more than 170.")

In [None]:
# Show the text of the phrases whose length is below 2.
train_data.loc[train_data["Words_Num"] < 2, ['Phrase']]

### Distribution of phrases with more than 170 words across sentiments

## cloud to see words

In [None]:
# Sentiment distribution of the very long phrases. The filter is strictly
# greater than 170 so it matches both the plot title and the outlier count
# printed earlier.
long_phrases = train_data[train_data["Words_Num"] > 170]

fig, ax = plt.subplots(figsize=(10,8))
long_phrases.groupby(['Sentiment']).size().sort_values().plot(kind='barh', ax=ax)
plt.title('Distribution of phrase with Words >170 over with sentiment')
plt.ylabel('Sentiment')
# The horizontal axis shows how many phrases fall in each class.
plt.xlabel('Number of phrases')
plt.grid()
plt.show()


## Frequency of words

In [None]:
# Flatten every phrase of the full training file into one list of tokens.
final = " ".join(train["Phrase"]).split()

# The 15 most common tokens with their counts, as a two-column frame.
lst = Counter(final).most_common(15)
df = pd.DataFrame(lst, columns = ['final', 'Count'])

fig, ax = plt.subplots(figsize=(10,8))
df.plot.bar(x='final',y='Count', ax=ax)
ax.set_title('15 Most Frequent word')
ax.set_ylabel('Frequency')
ax.set_xlabel('words')
plt.xticks(rotation=90)

In [None]:
# Word cloud built from every token of the training file.
all_text = ' '.join(final)
wordcloud = WordCloud(width = 1000, height = 500)
wordcloud = wordcloud.generate(all_text)

plt.figure(figsize=(15,8))
plt.imshow(wordcloud)
plt.title("Most Used words")
plt.axis("off")  # hide the axes, only the image matters
plt.show()

In [None]:
# Split every phrase on whitespace and keep the token lists in a new column.
train_data['SplitPhrase'] =train_data['Phrase'].str.split()
# Display the original phrase column (last expression of the cell).
train_data['Phrase'] 

In [None]:

# Count how often each token occurs within each sentiment class.
counters = {}
for Sentiment in train_data['Sentiment'].unique():
    indices = (train_data['Sentiment'] == Sentiment)
    counters[Sentiment] = Counter()
    for SplitPhrase in train_data[indices]['SplitPhrase']:
        counters[Sentiment].update(SplitPhrase)

# One panel per sentiment class, drawn directly on the axes returned by
# plt.subplots. Adding further subplots to the same figure would stack new
# axes on top of these and leave empty frames behind.
ordered_sentiments = sorted(counters)
fig, axes = plt.subplots(1, len(ordered_sentiments), figsize=(20, 5), squeeze=False)
for Sentiment, ax in zip(ordered_sentiments, axes.ravel()):
    wordcloud = WordCloud(background_color="white")
    wordcloud.generate_from_frequencies(frequencies=counters[Sentiment])
    ax.set_title(Sentiment)
    ax.imshow(wordcloud, interpolation="bilinear")
    ax.axis("off")

# Data preparation

#### another look on data

In [None]:
# Inspect 200 randomly drawn token lists (no seed is set here).
train_data['SplitPhrase'].sample(200)

In [None]:
# Join the tokens of each phrase with commas, glue the phrases together with
# single spaces, then cut the combined string at every comma.
comma_joined_phrases = (','.join(tokens) for tokens in train_data["SplitPhrase"])
Splited = pd.Series(' '.join(comma_joined_phrases).split(','))

In [None]:
# Distinct tokens that contain a hyphen.
hyphen_tokens = [token for token in final if "-" in token]
pd.Series(hyphen_tokens).unique()

In [None]:
# Distinct tokens that contain a straight apostrophe.
apostrophe_tokens = [token for token in final if "'" in token]
pd.Series(apostrophe_tokens).unique()

### stop words


In [None]:
# Tokens that the cleaning step discards outright.
stopwords = {"'s", "-", "", "the", "that", "an"}

# Stemmer instance reused for every token.
porter = PorterStemmer()

def ret_words(final):
    """Clean one phrase and return its stemmed tokens joined by single spaces.

    The phrase is lower-cased and commas are treated as separators. A token is
    dropped when it

    * is at most one character long,
    * contains a digit, a typographic apostrophe (’) or a hyphen,
    * is listed in the module-level ``stopwords`` set, or
    * is empty, or still contains anything other than ASCII letters, once its
      punctuation has been stripped.

    Every surviving token is stripped of punctuation and stemmed with the
    module-level ``porter`` stemmer.

    Args:
        final: The phrase as a string, or an iterable of tokens (which is
            joined with spaces first).

    Returns:
        The cleaned phrase as one space-separated string; empty if no token
        survives.
    """
    # Accept a pre-split token list as well as a raw string.
    if not isinstance(final, str):
        final = ' '.join(final)

    # Each normalisation step has to build on the previous one. Previously
    # every step restarted from the raw input, so only the comma replacement
    # took effect and capitalised stop words ("The", "That") were never removed.
    # Hyphens are deliberately left in place here: hyphenated tokens are
    # discarded by the filter below rather than merged into one word.
    word_text = final.lower().replace(',', ' ')

    words = []
    for word in word_text.split():
        if len(word) <= 1:
            continue
        if re.search('[0-9]', word):
            continue
        if '’' in word or '-' in word:
            continue
        if word in stopwords:
            continue
        stripped = re.sub(r'[^\w\s]', '', word)
        # Skip tokens that were pure punctuation (nothing left to stem) or
        # that still contain underscores / non-ASCII letters.
        if not stripped or re.search('[^a-zA-Z]', stripped):
            continue
        words.append(porter.stem(stripped))
    return ' '.join(words)

def preprocess(df,flag):
    """Add a ``final`` column holding the cleaned text of every phrase.

    Args:
        df: DataFrame with a ``Phrase`` column. It is modified in place.
        flag: Not used by this function.

    Returns:
        The same DataFrame object, now carrying the ``final`` column.
    """
    # Run the per-phrase cleaner over the whole column, then attach the result.
    cleaned_phrases = df["Phrase"].apply(ret_words)
    df['final'] = cleaned_phrases
    return df

In [None]:
# Clean the three splits. preprocess() adds the 'final' column to the frame it
# receives and returns that same frame, so each *_preprocessed name refers to
# the corresponding *_data object.
train_preprocessed = preprocess(train_data,0)
val_preprocessed = preprocess(val_data,1)
test_preprocessed = preprocess(test_data,1)

## A look at the data after cleansing

In [None]:
# Show the first 30 rows, including the new cleaned-text column.
train_preprocessed.head(30)

In [None]:
# Number of distinct tokens left after cleaning: glue all cleaned phrases
# together with single spaces and split on single spaces again.
cleaned_tokens = ' '.join(train_preprocessed["final"]).split(' ')
len(set(cleaned_tokens))

In [None]:
# Training split: identifier, cleaned text (feature) and sentiment (target).
id_train = train_preprocessed['PhraseId']
X_train = train_preprocessed['final']
y_train = train_preprocessed['Sentiment']

# Test split: same three columns.
id_test = test_preprocessed['PhraseId']
X_test = test_preprocessed['final']
y_test = test_preprocessed['Sentiment']

In [None]:
# BoW: learn the vocabulary on the training text and count term occurrences.
BoW = CountVectorizer()
Count_data = BoW.fit_transform(X_train)

# Keep the document-term matrix sparse inside the DataFrame. Calling
# .toarray() on the whole training set allocates a dense rows x vocabulary
# array that is almost entirely zeros.
# get_feature_names_out() is the current scikit-learn accessor for the column
# names; get_feature_names() has been removed.
BoW_X_train = pd.DataFrame.sparse.from_spmatrix(
    Count_data, columns=BoW.get_feature_names_out()
)

BoW_X_train

In [None]:
# Peek at the first cleaned training phrases.
X_train.head()

In [None]:
# Small illustration: refit the vectorizer on the first five phrases only and
# show their (tiny) dense count matrix. Note that this overwrites the
# vocabulary BoW learned before.
BoW.fit(X_train.head())
Count_data = BoW.transform(X_train.head())
# get_feature_names_out() replaces get_feature_names(), which has been removed
# from scikit-learn.
BoW_X_train = pd.DataFrame(Count_data.toarray(),columns=BoW.get_feature_names_out())
BoW_X_train

In [None]:
# TFIDF: unigrams and bigrams with sublinear term frequency and l2-normalised
# rows. Terms must appear in at least 5 phrases and in at most 25% of them;
# English stop words are removed.
TFIDF = TfidfVectorizer(sublinear_tf=True, min_df=5, max_df=0.25, norm='l2', encoding='latin-1',
                ngram_range=(1, 2), stop_words='english')

Count_data = TFIDF.fit_transform(X_train)

# Keep the matrix sparse inside the DataFrame; a dense copy of the whole
# training set is almost entirely zeros. get_feature_names_out() replaces
# get_feature_names(), which has been removed from scikit-learn.
TFIDF_X_train = pd.DataFrame.sparse.from_spmatrix(
    Count_data, columns=TFIDF.get_feature_names_out()
)


TFIDF_X_train

In [None]:
# Small illustration: default TF-IDF fitted on the first five phrases only,
# shown as a dense table.
TFIDF = TfidfVectorizer()
TFIDF.fit(X_train.head(5))
Count_data = TFIDF.transform(X_train.head(5))
# get_feature_names_out() replaces get_feature_names(), which has been removed
# from scikit-learn.
TFIDF_X_train = pd.DataFrame(Count_data.toarray(),columns=TFIDF.get_feature_names_out())


TFIDF_X_train

# Modelling

In [None]:
# Identifier, cleaned text (feature) and sentiment (target) for each split.
id_train = train_preprocessed['PhraseId']
X_train = train_preprocessed['final']
y_train = train_preprocessed['Sentiment']

id_val = val_preprocessed['PhraseId']
X_val = val_preprocessed['final']
y_val = val_preprocessed['Sentiment']

id_test = test_preprocessed['PhraseId']
X_test = test_preprocessed['final']
y_test = test_preprocessed['Sentiment']

Bow

In [None]:
# Bag-of-words counts fed into a logistic regression.
count_step = ('vect', CountVectorizer())
logreg_step = ('clf', LogisticRegression(random_state=0, max_iter=2000))
LR_clf_counts = Pipeline([count_step, logreg_step])

# Fit on the training split and predict that same split back.
LR_clf_counts.fit(X_train, y_train)
LR_cnt_pred_tr = LR_clf_counts.predict(X_train)

# Training accuracy, then support-weighted precision.
lr_cnt_accuracy = accuracy_score(y_train, LR_cnt_pred_tr)
lr_cnt_precision = precision_score(y_train, LR_cnt_pred_tr, average='weighted')
print(lr_cnt_accuracy)
print(lr_cnt_precision)

In [None]:
# Learning curve of the count-based logistic regression: weighted precision
# from 3-fold cross-validation at 10 training-set fractions evenly spaced
# between 1% and 100%, computed on all cores.
train_sizes, train_scores, test_scores = learning_curve(
    LR_clf_counts,
    X_train,
    y_train,
    train_sizes=np.linspace(0.01, 1.0, 10),
    cv=3,
    scoring='precision_weighted',
    n_jobs=-1,
)

# Mean and standard deviation across the folds for every training size.
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_mean, test_std = test_scores.mean(axis=1), test_scores.std(axis=1)

fig, ax = plt.subplots(figsize=(15,10))

# Mean curves: dashed for training, solid for cross-validation.
ax.plot(train_sizes, train_mean, '--', color="#111111",  label="Training score")
ax.plot(train_sizes, test_mean, color="#111111", label="Cross-validation score")

# Shaded band of one standard deviation around each curve.
for band_mean, band_std in ((train_mean, train_std), (test_mean, test_std)):
    ax.fill_between(train_sizes, band_mean - band_std, band_mean + band_std, color="#DDDDDD")

# Title, axis labels and legend.
ax.set_title("Learning Curve")
ax.set_xlabel("Training Set Size")
ax.set_ylabel("Weighted Precision Score")
ax.legend(loc="best")
fig.tight_layout()
plt.show()

SVM

In [None]:
# Bag-of-words counts fed into a linear SVM.
count_step = ('vect', CountVectorizer())
linear_svm_step = ('clf', LinearSVC(max_iter=3000))
SVM_clf_counts = Pipeline([count_step, linear_svm_step])

# Fit on the training split and predict that same split back.
SVM_clf_counts.fit(X_train, y_train)
SVM_cnt_pred_tr = SVM_clf_counts.predict(X_train)

# Training accuracy, then support-weighted precision.
svm_cnt_accuracy = accuracy_score(y_train, SVM_cnt_pred_tr)
svm_cnt_precision = precision_score(y_train, SVM_cnt_pred_tr, average='weighted')
print(svm_cnt_accuracy)
print(svm_cnt_precision)

In [None]:
# Learning curve of the count-based linear SVM: weighted precision from
# 3-fold cross-validation at 10 training-set fractions evenly spaced between
# 1% and 100%, computed on all cores.
train_sizes, train_scores, test_scores = learning_curve(
    SVM_clf_counts,
    X_train,
    y_train,
    train_sizes=np.linspace(0.01, 1.0, 10),
    cv=3,
    scoring='precision_weighted',
    n_jobs=-1,
)

# Mean and standard deviation across the folds for every training size.
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_mean, test_std = test_scores.mean(axis=1), test_scores.std(axis=1)

fig, ax = plt.subplots(figsize=(15,10))

# Mean curves: dashed for training, solid for cross-validation.
ax.plot(train_sizes, train_mean, '--', color="#111111",  label="Training score")
ax.plot(train_sizes, test_mean, color="#111111", label="Cross-validation score")

# Shaded band of one standard deviation around each curve.
for band_mean, band_std in ((train_mean, train_std), (test_mean, test_std)):
    ax.fill_between(train_sizes, band_mean - band_std, band_mean + band_std, color="#DDDDDD")

# Title, axis labels and legend.
ax.set_title("Learning Curve")
ax.set_xlabel("Training Set Size")
ax.set_ylabel("Weighted Precision Score")
ax.legend(loc="best")
fig.tight_layout()
plt.show()

Naive Bayes

In [None]:
# Bag-of-words counts fed into a multinomial naive Bayes classifier.
count_step = ('vect', CountVectorizer())
naive_bayes_step = ('clf', MultinomialNB())
NB_clf_counts = Pipeline([count_step, naive_bayes_step])

# Fit on the training split and predict that same split back.
NB_clf_counts.fit(X_train, y_train)
NB_cnt_pred_tr = NB_clf_counts.predict(X_train)

# Training accuracy, then support-weighted precision.
nb_cnt_accuracy = accuracy_score(y_train, NB_cnt_pred_tr)
nb_cnt_precision = precision_score(y_train, NB_cnt_pred_tr, average='weighted')
print(nb_cnt_accuracy)
print(nb_cnt_precision)

In [None]:
# Learning curve of the count-based naive Bayes model: weighted precision
# from 3-fold cross-validation at 10 training-set fractions evenly spaced
# between 1% and 100%, computed on all cores.
train_sizes, train_scores, test_scores = learning_curve(
    NB_clf_counts,
    X_train,
    y_train,
    train_sizes=np.linspace(0.01, 1.0, 10),
    cv=3,
    scoring='precision_weighted',
    n_jobs=-1,
)

# Mean and standard deviation across the folds for every training size.
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_mean, test_std = test_scores.mean(axis=1), test_scores.std(axis=1)

fig, ax = plt.subplots(figsize=(15,10))

# Mean curves: dashed for training, solid for cross-validation.
ax.plot(train_sizes, train_mean, '--', color="#111111",  label="Training score")
ax.plot(train_sizes, test_mean, color="#111111", label="Cross-validation score")

# Shaded band of one standard deviation around each curve.
for band_mean, band_std in ((train_mean, train_std), (test_mean, test_std)):
    ax.fill_between(train_sizes, band_mean - band_std, band_mean + band_std, color="#DDDDDD")

# Title, axis labels and legend.
ax.set_title("Learning Curve")
ax.set_xlabel("Training Set Size")
ax.set_ylabel("Weighted Precision Score")
ax.legend(loc="best")
fig.tight_layout()
plt.show()

### LR and TF-IDF

In [None]:
# TF-IDF features (unigrams + bigrams) fed into a logistic regression.
tfidf_step = (
    'tfidf',
    TfidfVectorizer(sublinear_tf=True, min_df=5, max_df=0.25, norm='l2',
                    encoding='latin-1', ngram_range=(1, 2), stop_words='english'),
)
logreg_step = ('clf', LogisticRegression(random_state=0, max_iter=2000))
LR_clf_tfidf = Pipeline([tfidf_step, logreg_step])

# Fit on the training split and predict that same split back.
LR_clf_tfidf.fit(X_train, y_train)
LR_tfidf_pred_tr = LR_clf_tfidf.predict(X_train)

# Training accuracy, then support-weighted precision.
lr_tfidf_accuracy = accuracy_score(y_train, LR_tfidf_pred_tr)
lr_tfidf_precision = precision_score(y_train, LR_tfidf_pred_tr, average='weighted')
print(lr_tfidf_accuracy)
print(lr_tfidf_precision)

In [None]:
# Learning curve of the TF-IDF logistic regression: weighted precision from
# 3-fold cross-validation at 10 training-set fractions evenly spaced between
# 1% and 100%, computed on all cores.
train_sizes, train_scores, test_scores = learning_curve(
    LR_clf_tfidf,
    X_train,
    y_train,
    train_sizes=np.linspace(0.01, 1.0, 10),
    cv=3,
    scoring='precision_weighted',
    n_jobs=-1,
)

# Mean and standard deviation across the folds for every training size.
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_mean, test_std = test_scores.mean(axis=1), test_scores.std(axis=1)

fig, ax = plt.subplots(figsize=(15,10))

# Mean curves: dashed for training, solid for cross-validation.
ax.plot(train_sizes, train_mean, '--', color="#111111",  label="Training score")
ax.plot(train_sizes, test_mean, color="#111111", label="Cross-validation score")

# Shaded band of one standard deviation around each curve.
for band_mean, band_std in ((train_mean, train_std), (test_mean, test_std)):
    ax.fill_between(train_sizes, band_mean - band_std, band_mean + band_std, color="#DDDDDD")

# Title, axis labels and legend.
ax.set_title("Learning Curve")
ax.set_xlabel("Training Set Size")
ax.set_ylabel("Weighted Precision Score")
ax.legend(loc="best")
fig.tight_layout()
plt.show()

In [None]:
# TF-IDF features (unigrams + bigrams) fed into a linear SVM.
tfidf_step = (
    'tfidf',
    TfidfVectorizer(sublinear_tf=True, min_df=5, max_df=0.25, norm='l2',
                    encoding='latin-1', ngram_range=(1, 2), stop_words='english'),
)
linear_svm_step = ('clf', LinearSVC( max_iter=2000))
SVM_clf_tfidf = Pipeline([tfidf_step, linear_svm_step])

# Fit on the training split and predict that same split back.
SVM_clf_tfidf.fit(X_train, y_train)
SVM_tfidf_pred_tr = SVM_clf_tfidf.predict(X_train)

# Training accuracy, then support-weighted precision.
svm_tfidf_accuracy = accuracy_score(y_train, SVM_tfidf_pred_tr)
svm_tfidf_precision = precision_score(y_train, SVM_tfidf_pred_tr, average='weighted')
print(svm_tfidf_accuracy)
print(svm_tfidf_precision)

In [None]:
# Learning curve of the TF-IDF linear SVM: weighted precision from 3-fold
# cross-validation at 10 training-set fractions evenly spaced between 1% and
# 100%, computed on all cores.
train_sizes, train_scores, test_scores = learning_curve(
    SVM_clf_tfidf,
    X_train,
    y_train,
    train_sizes=np.linspace(0.01, 1.0, 10),
    cv=3,
    scoring='precision_weighted',
    n_jobs=-1,
)

# Mean and standard deviation across the folds for every training size.
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_mean, test_std = test_scores.mean(axis=1), test_scores.std(axis=1)

fig, ax = plt.subplots(figsize=(15,10))

# Mean curves: dashed for training, solid for cross-validation.
ax.plot(train_sizes, train_mean, '--', color="#111111",  label="Training score")
ax.plot(train_sizes, test_mean, color="#111111", label="Cross-validation score")

# Shaded band of one standard deviation around each curve.
for band_mean, band_std in ((train_mean, train_std), (test_mean, test_std)):
    ax.fill_between(train_sizes, band_mean - band_std, band_mean + band_std, color="#DDDDDD")

# Title, axis labels and legend.
ax.set_title("Learning Curve")
ax.set_xlabel("Training Set Size")
ax.set_ylabel("Weighted Precision Score")
ax.legend(loc="best")
fig.tight_layout()
plt.show()

In [None]:
# TF-IDF features (unigrams + bigrams) fed into multinomial naive Bayes.
tfidf_step = (
    'tfidf',
    TfidfVectorizer(sublinear_tf=True, min_df=5, max_df=0.25, norm='l2',
                    encoding='latin-1', ngram_range=(1, 2), stop_words='english'),
)
naive_bayes_step = ('clf', MultinomialNB())
NB_clf_tfidf = Pipeline([tfidf_step, naive_bayes_step])

# Fit on the training split and predict that same split back.
NB_clf_tfidf.fit(X_train, y_train)
NB_tfidf_pred_tr = NB_clf_tfidf.predict(X_train)

# Training accuracy, then support-weighted precision.
nb_tfidf_accuracy = accuracy_score(y_train, NB_tfidf_pred_tr)
nb_tfidf_precision = precision_score(y_train, NB_tfidf_pred_tr, average='weighted')
print(nb_tfidf_accuracy)
print(nb_tfidf_precision)

In [None]:
# Learning curve of the TF-IDF naive Bayes model: weighted precision from
# 3-fold cross-validation at 10 training-set fractions evenly spaced between
# 1% and 100%, computed on all cores.
train_sizes, train_scores, test_scores = learning_curve(
    NB_clf_tfidf,
    X_train,
    y_train,
    train_sizes=np.linspace(0.01, 1.0, 10),
    cv=3,
    scoring='precision_weighted',
    n_jobs=-1,
)

# Mean and standard deviation across the folds for every training size.
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_mean, test_std = test_scores.mean(axis=1), test_scores.std(axis=1)

fig, ax = plt.subplots(figsize=(15,10))

# Mean curves: dashed for training, solid for cross-validation.
ax.plot(train_sizes, train_mean, '--', color="#111111",  label="Training score")
ax.plot(train_sizes, test_mean, color="#111111", label="Cross-validation score")

# Shaded band of one standard deviation around each curve.
for band_mean, band_std in ((train_mean, train_std), (test_mean, test_std)):
    ax.fill_between(train_sizes, band_mean - band_std, band_mean + band_std, color="#DDDDDD")

# Title, axis labels and legend.
ax.set_title("Learning Curve")
ax.set_xlabel("Training Set Size")
ax.set_ylabel("Weighted Precision Score")
ax.legend(loc="best")
fig.tight_layout()
plt.show()

In [None]:
### hyperparameter tuning

In [None]:
# Turn the cleaned training text into a bag-of-words count matrix so it can be
# handed to the grid search below.
vect=  CountVectorizer()
X_train_cnt = vect.fit_transform(X_train)


In [None]:
def svc_param_selection(X, y, nfolds, kernal):
    """Grid-search SVC hyper-parameters for one kernel and return the best set.

    Args:
        X: Feature matrix passed to ``GridSearchCV.fit``.
        y: Target labels passed to ``GridSearchCV.fit``.
        nfolds: Value forwarded as the ``cv`` argument of the search.
        kernal: Kernel name given to ``SVC``. ``'rbf'`` searches C and gamma,
            ``'linear'`` searches C only, and any other value searches C,
            gamma and degree.

    Returns:
        The ``best_params_`` dictionary of the fitted search.
    """
    Cs = [ 0.1, 1, 10]
    gammas = [0.01, 0.1, 1]
    degrees = [0, 1, 2, 3]

    # Grid per kernel; anything that is not listed falls back to the
    # polynomial-style grid.
    grids_by_kernel = {
        'rbf': {'C': Cs, 'gamma' : gammas},
        'linear': {'C': Cs},
    }
    fallback_grid = {'C': Cs, 'gamma' : gammas, 'degree':degrees}
    param_grid = grids_by_kernel.get(kernal, fallback_grid)

    grid_search = GridSearchCV(SVC(kernel=kernal), param_grid, cv=nfolds)
    grid_search.fit(X, y)
    return grid_search.best_params_


## final model

In [None]:
# Bag-of-words count matrix of the cleaned training text (same statements as
# in the hyper-parameter tuning section).
vect=  CountVectorizer()
X_train_cnt = vect.fit_transform(X_train)

In [None]:
def svc_param_selection(X, y, nfolds, kernal):
    """Grid-search SVC hyper-parameters for one kernel and return the best set.

    Args:
        X: Feature matrix passed to ``GridSearchCV.fit``.
        y: Target labels passed to ``GridSearchCV.fit``.
        nfolds: Value forwarded as the ``cv`` argument of the search.
        kernal: Kernel name given to ``SVC``. ``'rbf'`` searches C and gamma,
            ``'linear'`` searches C only, and any other value searches C,
            gamma and degree.

    Returns:
        The ``best_params_`` dictionary of the fitted search.
    """
    Cs = [ 0.1, 1, 10]
    gammas = [0.01, 0.1, 1]
    degrees = [0, 1, 2, 3]

    # Grid per kernel; anything that is not listed falls back to the
    # polynomial-style grid.
    grids_by_kernel = {
        'rbf': {'C': Cs, 'gamma' : gammas},
        'linear': {'C': Cs},
    }
    fallback_grid = {'C': Cs, 'gamma' : gammas, 'degree':degrees}
    param_grid = grids_by_kernel.get(kernal, fallback_grid)

    grid_search = GridSearchCV(SVC(kernel=kernal), param_grid, cv=nfolds)
    grid_search.fit(X, y)
    return grid_search.best_params_

In [None]:
# Selected model: bag-of-words counts into a linear-kernel SVC with C=0.1.
count_step = ('vect', CountVectorizer())
svc_step = ('clf', SVC(C=0.1, kernel='linear'))
SVM_clf_counts_lin = Pipeline([count_step, svc_step])
SVM_clf_counts_lin.fit(X_train, y_train)

# Predictions for the training, validation and test splits.
SVM_cnt_pred_tr_lin = SVM_clf_counts_lin.predict(X_train)
SVM_cnt_pred_val_lin = SVM_clf_counts_lin.predict(X_val)
SVM_tst = SVM_clf_counts_lin.predict(X_test)

# Micro-averaged precision on each split.
precision_train = precision_score(y_train, SVM_cnt_pred_tr_lin, average='micro')
precision_val = precision_score(y_val, SVM_cnt_pred_val_lin, average='micro')
precision_test = precision_score(y_test, SVM_tst, average='micro')

print("precision on training: ",precision_train)
print("precision on validation: ",precision_val)
print("precision on testing: ",precision_test)


In [None]:
# Open both competition archives only to confirm that they exist and are
# valid zip files, and close the handles right away. They used to stay open
# for the rest of the session although nothing in this cell reads from them.
with zipfile.ZipFile('../input/sentiment-analysis-on-movie-reviews/train.tsv.zip') as archive_train, \
        zipfile.ZipFile('../input/sentiment-analysis-on-movie-reviews/test.tsv.zip') as archive_test:
    pass

In [None]:
# Load the complete training file and the competition test file; both are
# tab-separated and read directly from their zip archives.
final_train = pd.read_csv("../input/sentiment-analysis-on-movie-reviews/train.tsv.zip", sep='\t')
final_test = pd.read_csv("../input/sentiment-analysis-on-movie-reviews/test.tsv.zip", sep='\t')

In [None]:
# Apply the same cleaning to the full training file and to the competition
# test file; each call adds a 'final' column to the frame it is given.
ftrain_preprocessed = preprocess (final_train,0)
ftest_preprocessed = preprocess (final_test,1)

In [None]:
# Full training file: identifier, cleaned text (feature), sentiment (target).
id_train = ftrain_preprocessed['PhraseId']
X_train = ftrain_preprocessed['final']
y_train = ftrain_preprocessed['Sentiment']

# Competition test file: only identifier and cleaned text are selected.
id_test = ftest_preprocessed['PhraseId']
X_test = ftest_preprocessed['final']

In [None]:
# Final model: bag-of-words counts into a linear-kernel SVC with C=0.1,
# trained on the complete training file.
SVM_clf= Pipeline([
    ('vect', CountVectorizer()),
    ('clf', SVC(C=0.1, kernel='linear'))
])
SVM_clf.fit(X_train , y_train)

# Predict with the pipeline that was just fitted on all the training data.
# The earlier SVM_clf_counts_lin pipeline only saw the 60% training split, so
# predicting with it would throw the refit away.
pred_tst = SVM_clf.predict(X_test)

In [None]:
# Build the submission table (one predicted sentiment per PhraseId) and write
# it to CSV without the index column.
# NOTE(review): the file name ends in "_LR" although pred_tst comes from an
# SVC pipeline; confirm whether the name is intentional.
output=pd.DataFrame({'PhraseId' : id_test , 'Sentiment' : pred_tst })
output.to_csv('Sentiment_preds_LR.csv' , index=False)