<a href="https://colab.research.google.com/github/Satorumi/Machine-Learning/blob/main/ML_Tutorial_with_sklearn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Load Data**

In [None]:
# Download the review dataset from its raw GitHub URL into the working directory.
# curl's progress output for this cell reports a transfer of about 7.7 MB (7874k),
# so this is not a sub-1MB file. The later loading cell opens it as
# 'Books_small_10000.json' and parses one JSON object per line.
!curl --remote-name \
     -H 'Accept: application/vnd.github.v3.raw' \
     --location https://raw.githubusercontent.com/Satorumi/sklearn/master/data/sentiment/Books_small_10000.json

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 7874k  100 7874k    0     0  13.7M      0 --:--:-- --:--:-- --:--:-- 13.7M


#### Create Classes

In [None]:
import random 

class Review: # one review: its text, its score and the label derived from the score
  """A single review together with the feedback label implied by its score."""

  def __init__(self, text, score):
    """Keep `text` and `score`, then derive `feedback` from the score."""
    self.text, self.score = text, score
    self.feedback = self.get_feedback()

  def get_feedback(self):
    """Map `self.score` to a label: <= 2 negative, below 4 neutral, otherwise positive."""
    rating = self.score
    # the checks run in this order on purpose: whatever fails both falls through to positive
    if rating <= 2:
      return 'Negative Feedback'
    if rating < 4:
      return 'Neutral Feedback'
    return 'Positive Feedback'

class reviewContainer:
  """Wraps a list of review objects and exposes their texts and labels.

  Every stored review must provide `.text` and `.feedback` attributes.
  """

  # labels in the order their groups are concatenated before shuffling
  _LABELS = ('Negative Feedback', 'Positive Feedback', 'Neutral Feedback')

  def __init__(self, reviews):
    self.reviews = reviews

  def get_text(self):
    """Return the text of every stored review, in their current order."""
    return [review.text for review in self.reviews]

  def get_feedback(self):
    """Return the feedback label of every stored review, in their current order."""
    return [review.feedback for review in self.reviews]

  def evenly_distribute(self, seed=None):
    """Down-sample in place so every feedback class present has the same count.

    Each class is cut to the size of the smallest non-empty class (keeping the
    first reviews of each class), then the combined list is shuffled. Cutting
    to the negative count alone would leave the classes unbalanced whenever
    another class is smaller than the negative one.

    seed: optional; when given, the shuffle uses its own `random.Random(seed)`
      and is reproducible. When None, the module-level `random` state is used.
    """
    groups = [[review for review in self.reviews if review.feedback == label]
              for label in self._LABELS]
    # ignore absent classes so a missing label does not empty the whole set
    size = min((len(group) for group in groups if group), default=0)
    balanced = [review for group in groups for review in group[:size]]
    if seed is None:
      random.shuffle(balanced)
    else:
      random.Random(seed).shuffle(balanced)
    self.reviews = balanced

In [None]:
import json

filename = 'Books_small_10000.json'

# Each non-empty line of the file is parsed as one JSON object; only the
# 'reviewText' and 'overall' fields are kept, wrapped in a Review.
reviews = [] # store review
with open(filename, encoding='utf-8') as f: # explicit encoding: do not depend on the platform default
  for line in f:
    if not line.strip(): # tolerate blank lines, e.g. a trailing newline at end of file
      continue
    review = json.loads(line) # load into python obj
    reviews.append(Review(review['reviewText'], review['overall']))



## **Prep Data**

In [None]:
from sklearn.model_selection import train_test_split

# split data for train and test part
train, test = train_test_split(reviews, test_size=0.33, random_state=42)

# evenly_distribute() shuffles with the module-level `random` generator, so
# seed it here; otherwise the order of the balanced sets changes on every run
# even though the split itself is seeded
random.seed(42)

train_evenly_distribute = reviewContainer(train)
test_evenly_distribute = reviewContainer(test)
# balance the feedback classes inside each split (modifies the containers in place)
train_evenly_distribute.evenly_distribute()
test_evenly_distribute.evenly_distribute()

# separate data
X_train = train_evenly_distribute.get_text() # encoding fields
y_train = train_evenly_distribute.get_feedback() # predict target

X_test = test_evenly_distribute.get_text() # encoding fields
y_test = test_evenly_distribute.get_feedback() # predict target
len(y_train)

1308

### **Bag of Words**: TF-IDF Vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# TF-IDF weighting, chosen so that frequent words get less focus
vectorizer = TfidfVectorizer()

# the vectorizer is fitted on the training text only; the test text is
# encoded with that same fitted vectorizer
X_train_vectors = vectorizer.fit_transform(X_train)
X_test_vectors = vectorizer.transform(X_test)

# show the row that encodes the first training review
first_review_vector = X_train_vectors[0]
first_review_vector

<1x11465 sparse matrix of type '<class 'numpy.float64'>'
	with 96 stored elements in Compressed Sparse Row format>

### **Model Selection**

#### Support Vector Machines

In [None]:
from sklearn import svm

# C-Support Vector Classification with a linear kernel, trained on the
# vectorized training reviews (fit returns the estimator itself)
clf_svm = svm.SVC(kernel='linear').fit(X_train_vectors, y_train)

# labels predicted for the vectorized test reviews
svm_pred = clf_svm.predict(X_test_vectors)

#### Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# random_state is fixed, like the train/test split and the logistic regression
# in this notebook, so the fitted tree and its scores are the same on every run
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train_vectors, y_train)
tree_predict = tree.predict(X_test_vectors) # labels predicted for the test reviews


#### Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# the sparse TF-IDF matrices are converted to dense arrays before being
# handed to the Gaussian naive Bayes model
gnb = GaussianNB().fit(X_train_vectors.toarray(), y_train)

# labels predicted for the test reviews
gnb_pred = gnb.predict(X_test_vectors.toarray())

#### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# seeded logistic regression, allowed up to 200 solver iterations
lr = LogisticRegression(max_iter=200, random_state=42).fit(X_train_vectors, y_train)

# labels predicted for the test reviews
lr_pred = lr.predict(X_test_vectors)

### **Analysis and Evaluation**

In [None]:
# mean accuracy of each fitted model on the test data and labels, printed in
# the order: SVM, decision tree, Gaussian naive Bayes, logistic regression
for fitted, test_matrix in (
    (clf_svm, X_test_vectors),
    (tree, X_test_vectors),
    (gnb, X_test_vectors.toarray()), # gnb is scored on the dense form
    (lr, X_test_vectors),
):
  print(fitted.score(test_matrix, y_test))

0.6153846153846154
0.4439102564102564
0.4342948717948718
0.6217948717948718


In [None]:
# F1 Score
from sklearn.metrics import f1_score

# average=None yields one score per class, reported in this label order
label_order = ['Negative Feedback', 'Positive Feedback', 'Neutral Feedback']

# printed in the order: SVM, decision tree, Gaussian naive Bayes, logistic regression
for fitted, test_matrix in (
    (clf_svm, X_test_vectors),
    (tree, X_test_vectors),
    (gnb, X_test_vectors.toarray()), # gnb predicts from the dense form
    (lr, X_test_vectors),
):
  print(f1_score(y_test, fitted.predict(test_matrix), average=None, labels=label_order))

[0.61204819 0.69417476 0.5415677 ]
[0.44827586 0.47115385 0.41314554]
[0.43902439 0.46786632 0.40089087]
[0.62102689 0.70117647 0.5410628 ]


### **Tuning Model**

In [None]:
from sklearn.model_selection import GridSearchCV
# to test which is the best model selection for data

# candidate settings: every kernel is tried with every value of C
parameters = {'kernel': ('linear', 'rbf'),
              'C': [1, 4, 8, 16]}
svc = svm.SVC()
clf = GridSearchCV(svc, parameters, cv=5)

# actually run the 5-fold search on the training data; constructing the
# GridSearchCV object alone does not tune anything
clf.fit(X_train_vectors, y_train)
print(clf.best_params_, clf.best_score_) # winning combination and its mean cross-validated score
clf

GridSearchCV(cv=5, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [1, 4, 8, 16], 'kernel': ('linear', 'rbf')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

### **Saving and Loading Model**

In [None]:
import pickle

# using pickle library to save model
# `model` must name a fitted estimator; the linear SVM trained earlier in this
# notebook is used here (tree, gnb or lr can be substituted)
model = clf_svm
with open('file_name', 'wb') as f: # same path that the loading cell opens
  pickle.dump(model, f)

In [None]:
# load model
# NOTE: unpickling can execute arbitrary code, so only load files you created
# yourself or otherwise trust
with open('file_name', 'rb') as f:
  model_name = pickle.load(f)

# apply model: predict() takes only the feature matrix (no labels); the
# vectorized test reviews from this notebook are used as input
model_name.predict(X_test_vectors)