## Import Essential Libraries

In [None]:
import os

import numpy as np
import pandas as pd
import seaborn as sb
from matplotlib import pyplot as plt

# Sklearn Libraries
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Read the labelled training reviews from the competition input folder
# and preview the first rows.
train_csv_path = '../input/student-shopee-code-league-sentiment-analysis/train.csv'
trainDF = pd.read_csv(train_csv_path)
trainDF.head()

In [None]:
# Quick sanity check of the target column: which rating values occur,
# and how many labelled rows there are in total.
unique_ratings = trainDF['rating'].unique()
n_train_rows = len(trainDF)
print("Unique Ratings:", unique_ratings)
print("Total Number of ratings in train set:", n_train_rows)

In [None]:
# Read the competition test set (the rows to be predicted for the submission)
# and preview the first rows.
test_csv_path = '../input/student-shopee-code-league-sentiment-analysis/test.csv'
testDF = pd.read_csv(test_csv_path)
testDF.head()

In [None]:
# Number of rows in the competition test set
n_test_rows = len(testDF)
print("Total Number of ratings in test set:", n_test_rows)

## Train Test Split

In [None]:
# Hold out 20% of the labelled data for validation.
# X is the raw review text, y the rating to predict; the fixed random_state
# makes the split reproducible across runs.
X, y = trainDF['review'], trainDF['rating']
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

## Model Training with Pipeline

In [None]:
# Text-classification pipeline: raw review text -> token counts -> tf-idf weights
# -> logistic-regression classifier. The step names are kept as-is because they
# are the keys used to address the steps (e.g. in parameter grids).
pipeline_steps = [
    ('countVectoriser', CountVectorizer()),
    ('tf-idf', TfidfTransformer()),
    ('clf', LogisticRegression(random_state=100)),
]
pipeline = Pipeline(steps=pipeline_steps)

In [None]:
# Fit the pipeline on the training split only.
# Pipeline.fit() fits in place and returns the pipeline itself, so `model`
# is another name for the same (now fitted) `pipeline` object, not a copy.
pipeline.fit(X_train, y_train)
model = pipeline

In [None]:
# Mean accuracy on the data the model was fitted on (an optimistic figure)
train_accuracy = model.score(X_train, y_train)
print(train_accuracy)

### Some Results from Using Different Classifiers

**Model Training Set Scores:**

Logistic Regression - 0.6160598733056332

Random Forest Classifier - 0.6803095838158164

Decision Tree Classifier - 0.9095003746338806

**Model Scores:**

Logistic Regression - 0.6111531152297852

Random Forest Classifier - 0.6654337890212586

In [None]:
# Predict ratings for the held-out split once and reuse those predictions for the
# score. model.score(X_test, y_test) would run the whole pipeline over X_test a
# second time only to compute the same accuracy.
y_test_pred = model.predict(X_test)
print(accuracy_score(y_test, y_test_pred))

**Model Test Set Scores:**

Logistic Regression - 0.483295303613391, 0.48452133637571093

Random Forest Classifier - 0.43289173449579404

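Decision Tree Classifier - 0.41967782583523483

*Note:* the Decision Tree scores about 0.91 on the training set but only about 0.42 here, i.e. it overfits the training data; Logistic Regression has the highest test-set score of the three.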

In [None]:
# Confusion matrix of the hold-out predictions.
# The class order is taken from every rating present in the labelled data and
# passed explicitly, so the matrix always has one row/column per rating (even if
# a rating is absent from this split) and the labels below cannot drift out of
# sync with the matrix.
rating_labels = sorted(y.unique())
cm = confusion_matrix(y_test, y_test_pred, labels=rating_labels)

# Wrap in a labelled DataFrame: rows are the true ratings, columns the predicted ones
rating_names = [str(rating) for rating in rating_labels]
cm_df = pd.DataFrame(cm, index=rating_names, columns=rating_names)
cm_df = cm_df.rename_axis(index='actual', columns='predicted')

# Display the labelled table rather than the bare array
cm_df

In [None]:
# Training the final model on the entire dataset.
# pipeline.fit() refits in place and returns the same object that `model` refers to,
# so calling it here would silently turn `model` into a model that has already seen
# X_test and invalidate any later re-run of the hold-out evaluation. Fit an unfitted
# copy instead; the hyper-parameters (including random_state) are identical.
fullModel = clone(pipeline).fit(X, y)

# Accuracy on the very data the model was trained on; not a generalisation estimate
print(fullModel.score(X, y))

## Make Submission

In [None]:
# Predict a rating for every review in the competition test set
testRatings = fullModel.predict(testDF['review'])

# Assemble the submission table: one row per review_id with its predicted rating
review_ids = testDF['review_id'].tolist()
predicted_ratings = testRatings.tolist()
submission = pd.DataFrame(dict(review_id=review_ids, rating=predicted_ratings))

In [None]:
# Preview the first five rows of the submission table
submission.head(5)

In [None]:
# Load the organisers' sample submission so its layout can be compared by eye
# with the `submission` frame built above.
sample_csv_path = '../input/student-shopee-code-league-sentiment-analysis/sampleSubmission.csv'
sampleSubmission = pd.read_csv(sample_csv_path)
sampleSubmission.head()

In [None]:
# Write the submission file to the working directory; index=False keeps the
# DataFrame's row index out of the CSV.
submission.to_csv(path_or_buf='The_OG_submission.csv', index=False)