In [1]:
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline

In [7]:
# Options shared by both reads: comma-separated input, and keep_default_na=False
# so pandas does not apply its default NaN markers when parsing cells.
csv_options = {'delimiter': ',', 'keep_default_na': False}

# Training file: only the review text and its rating are loaded.
reviews_train = pd.read_csv('train.csv', usecols=['Reviews', 'Rating'], **csv_options)

# Test file: only the row id and the review text are loaded.
reviews_test = pd.read_csv('test.csv', usecols=['Id', 'Reviews'], **csv_options)

In [8]:
stemmer = PorterStemmer()
words = stopwords.words("english")
# Set copy of the stop-word list so each token check is a hash lookup rather
# than a scan of the whole list. `words` itself is left as the original list.
stop_word_set = set(words)


def stem_review(text):
    """Return `text` as space-joined Porter stems with English stop words removed.

    Every character outside a-z/A-Z is replaced by a space, the result is
    lower-cased and split on whitespace, stop words are dropped, and each
    remaining token is stemmed.

    The text is lower-cased *before* the stop-word check: NLTK's English list
    is lower-case, so filtering on the original casing let capitalised stop
    words (e.g. "This", "Was") through, and they were then stemmed into
    spurious tokens such as "thi" and "wa".
    """
    tokens = re.sub("[^a-zA-Z]", " ", text).lower().split()
    return " ".join(stemmer.stem(token) for token in tokens if token not in stop_word_set)


reviews_train['Stemmed'] = reviews_train['Reviews'].apply(stem_review)

In [9]:
# TF-IDF features over unigrams and bigrams.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),    # single words and adjacent word pairs
    min_df=3,              # ignore terms that occur in fewer than 3 documents
    stop_words="english",  # scikit-learn's built-in English stop-word list
    sublinear_tf=True,     # use 1 + log(tf) instead of the raw term count
    norm='l2',             # scale each document vector to unit Euclidean length
)

In [10]:
# Fixed seed so the random forest (and therefore out.csv) is reproducible.
RANDOM_SEED = 42

X = reviews_train['Stemmed']
Y = reviews_train['Rating']

# Every labelled row is used for fitting. The previous
# train_test_split(X, Y, test_size=0) call is rejected with a ValueError by
# current scikit-learn releases, so the assignment is written out directly.
# X_test / y_test stay defined (as empty slices) for any code that expects them.
X_train, y_train = X, Y
X_test, y_test = X.iloc[:0], Y.iloc[:0]

# Vectorise -> keep the 1200 terms with the highest chi-squared score -> random forest.
pipeline = Pipeline([('vect', vectorizer),
                     ('chi',  SelectKBest(chi2, k=1200)),
                     ('clf', RandomForestClassifier(random_state=RANDOM_SEED))])

model = pipeline.fit(X_train, y_train)

# The pipeline is fitted on the 'Stemmed' column, so the test reviews must be
# stemmed and stop-word-filtered too before predicting; feeding the raw
# 'Reviews' text (as was done previously) gives the model tokens that do not
# line up with the vocabulary it learned.
stop_lookup = set(words)


def _stem_for_prediction(text):
    """Strip non-letters, lower-case, drop stop words and Porter-stem `text`."""
    tokens = re.sub("[^a-zA-Z]", " ", text).lower().split()
    return " ".join(stemmer.stem(token) for token in tokens if token not in stop_lookup)


test_stemmed = reviews_test['Reviews'].apply(_stem_for_prediction)

final = model.predict(test_stemmed)

# Submission file: one predicted rating per test Id, without the index column.
d = {'Id': reviews_test['Id'], 'Rating': final}
df = pd.DataFrame(data=d)
df.to_csv(path_or_buf = 'out.csv', index = False)

