<a href="https://colab.research.google.com/github/OnlyourMiracle/MachineLearning/blob/master/Course/PythonMachineLearning/MovieClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 应用机器学习于情感分析

In [None]:
!tar -zxf /content/drive/MyDrive/MLIA/Data/aclImdb_v1.tar.gz

In [None]:
!pip install pyprind 
import pyprind
import pandas as pd
import os

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyprind
  Downloading PyPrind-2.11.3-py2.py3-none-any.whl (8.4 kB)
Installing collected packages: pyprind
Successfully installed pyprind-2.11.3


In [None]:
# Root of the extracted IMDb archive and the folder-name -> label mapping.
basepath = 'aclImdb'
labels = {'pos': 1, 'neg': 0}
# The bar is advanced once per review file; the assembled frame has 50000
# rows (see the df.shape output later in this notebook), so size it to match.
pbar = pyprind.ProgBar(50000)
# Collect rows in a plain list and build the frame once at the end:
# DataFrame.append copied the whole frame on every call and no longer
# exists in pandas 2.x.
records = []
for s in ('test', 'train'):
  for l in ('pos', 'neg'):
    path = os.path.join(basepath, s, l)
    # os.listdir gives no ordering guarantee; sorting keeps the row order,
    # and therefore the seeded shuffle applied afterwards, reproducible.
    for file in sorted(os.listdir(path)):
      with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
        txt = infile.read()
      records.append((txt, labels[l]))
      pbar.update()
df = pd.DataFrame(records, columns=['review', 'sentiment'])

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:06


In [None]:
import numpy as np

# Shuffle the rows with a fixed seed so the resulting order is repeatable.
np.random.seed(0)
shuffled_index = np.random.permutation(df.index)
df = df.reindex(shuffled_index)

In [None]:
df.to_csv('/content/drive/MyDrive/MLIA/Data/movie_data.csv', index=False, encoding='utf-8')

In [None]:
# Reload the shuffled raw reviews saved by the to_csv cell just above.
# The "cleaned" CSV is only written further down, after preprocessor() has
# been applied, so reading it here fails on a fresh Restart & Run All.
df = pd.read_csv('/content/drive/MyDrive/MLIA/Data/movie_data.csv', encoding='utf-8')

In [None]:
df.head()

Unnamed: 0,review,sentiment
0,i am surprised that there is confusion over th...,1
1,had i known to what i was submitting myself i ...,0
2,i didn t enjoy this movie at all for one i jus...,0
3,as i ve noticed with a lot of imdb comments ce...,1
4,this engaging which it shouldn t be low grade ...,0


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words demo on three toy sentences.
count = CountVectorizer()
docs = np.array([
        'The sun is shining',
        'The weather is sweet',
        'The sun is shining, the weather is sweet, and one and one is two'])
bag = count.fit_transform(docs)
# A bare expression in the middle of a cell is not displayed, so print the
# learned vocabulary explicitly before the count matrix.
print(count.vocabulary_)
print(bag.toarray())

{'the': 6,
 'sun': 4,
 'is': 1,
 'shining': 3,
 'weather': 8,
 'sweet': 5,
 'and': 0,
 'one': 2,
 'two': 7}

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw term counts of the toy documents with tf-idf and show
# the dense result as the cell output.
tfidf = TfidfTransformer(
    use_idf=True,
    norm='l2',
    smooth_idf=True,
)
term_counts = count.fit_transform(docs)
tfidf.fit_transform(term_counts).toarray()

array([[0.        , 0.43370786, 0.        , 0.55847784, 0.55847784,
        0.        , 0.43370786, 0.        , 0.        ],
       [0.        , 0.43370786, 0.        , 0.        , 0.        ,
        0.55847784, 0.43370786, 0.        , 0.55847784],
       [0.50238645, 0.44507629, 0.50238645, 0.19103892, 0.19103892,
        0.19103892, 0.29671753, 0.25119322, 0.19103892]])

In [None]:
import re

def preprocessor(text):
  """Strip markup and punctuation from a review while keeping emoticons.

  Steps:
    1. Remove anything that looks like an HTML tag (``<...>``).
    2. Collect emoticons such as ``:)``, ``;-(``, ``=D`` from the text.
    3. Lowercase the text and collapse every run of non-word characters
       into a single space.
    4. Append the collected emoticons (with the ``-`` nose removed) at the
       end, separated by spaces.

  Args:
    text: Raw review text.

  Returns:
    The cleaned string.
  """
  # Raw strings: '\)' and '\W' are invalid escapes in ordinary literals.
  text = re.sub(r'<[^>]*>', '', text)
  emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
  cleaned = re.sub(r'[\W]+', ' ', text.lower())
  suffix = ' '.join(emoticons).replace('-', '')
  # When the text ends in a word character there is no trailing space, and
  # the first emoticon would otherwise be glued onto the last word.
  if suffix and not cleaned.endswith(' '):
    cleaned += ' '
  return cleaned + suffix

# Clean every review in place, then persist the cleaned frame (without the
# index column) for the later cells that read this CSV.
cleaned_reviews = df['review'].apply(preprocessor)
df['review'] = cleaned_reviews
df.to_csv(
    '/content/drive/MyDrive/MLIA/Data/movie_data_cleaned.csv',
    index=False,
    encoding='utf-8',
)

In [None]:
def tokenizer(txt):
  """Split ``txt`` on runs of whitespace and return the list of tokens."""
  tokens = txt.split()
  return tokens

tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [None]:
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()
def tokenizer_porter(text):
  """Whitespace-tokenize ``text`` and stem each token with ``porter``.

  Returns the stemmed tokens in their original order.
  """
  stems = []
  for word in text.split():
    stems.append(porter.stem(word))
  return stems

tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
from nltk.corpus import stopwords

# English stop-word list; also used by the streaming tokenizer further down.
stop = stopwords.words('english')
# Stem the sample sentence, keep at most its last ten tokens, and show the
# ones that are not stop words.
stemmed_tail = tokenizer_porter('a runner likes running and runs a lot')[-10:]
[token for token in stemmed_tail if token not in stop]

['runner', 'like', 'run', 'run', 'lot']

In [None]:
print(df.shape)
# Split by position. Label-based .loc slicing includes its end label, so
# `.loc[:25000]` and `.loc[25000:]` both contained row 25000, putting the
# same review in the training and the test set.
n_train = 25000
reviews = df['review'].values
sentiments = df['sentiment'].values
x_train, y_train = reviews[:n_train], sentiments[:n_train]
x_test, y_test = reviews[n_train:], sentiments[n_train:]

(50000, 2)


In [None]:
from sklearn.pipeline import Pipeline 
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

# Reviews are already lowercased/cleaned, so the vectorizer does no extra
# preprocessing of its own.
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)

# Two sub-grids: tf-idf weighted features (vectorizer defaults) and raw,
# un-normalised term frequencies.
param_grid = [
    {
        'clf__penalty': ['l1', 'l2'],
        'clf__C': [1.0, 10.0, 100.0],
    },
    {
        'vect__use_idf': [False],
        'vect__norm': [None],
        'clf__penalty': ['l1', 'l2'],
        'clf__C': [1.0, 10.0, 100.0],
    },
]

# The default solver rejects the 'l1' penalty, which made every l1 candidate
# fail (30 of 60 fits) and left half of the grid scored as NaN. liblinear
# supports both 'l1' and 'l2'.
lr_tfidf = Pipeline([
    ('vect', tfidf),
    ('clf', LogisticRegression(random_state=0, solver='liblinear')),
])
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring='accuracy', cv=5, verbose=1, n_jobs=-1)

if 'TRAVIS' in os.environ:
    # Small subset for CI runs. The names must match the lowercase
    # x_train/x_test used below; the previous X_train/X_test were never read.
    gs_lr_tfidf.verbose = 2
    x_train = df['review'].values[:250]
    y_train = df['sentiment'].values[:250]
    x_test = df['review'].values[25000:25250]
    y_test = df['sentiment'].values[25000:25250]

gs_lr_tfidf.fit(x_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


30 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_logistic.py", line 449, in _check_solv

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vect',
                                        TfidfVectorizer(lowercase=False)),
                                       ('clf',
                                        LogisticRegression(random_state=0))]),
             n_jobs=-1,
             param_grid=[{'clf__C': [1.0, 10.0, 100.0],
                          'clf__penalty': ['l1', 'l2']},
                         {'clf__C': [1.0, 10.0, 100.0],
                          'clf__penalty': ['l1', 'l2'], 'vect__norm': [None],
                          'vect__use_idf': [False]}],
             scoring='accuracy', verbose=1)

In [None]:
# Keep the refit best pipeline as `clf`, then report the winning parameters,
# the cross-validated accuracy and the accuracy on the held-out reviews.
clf = gs_lr_tfidf.best_estimator_
for summary_value in (
    gs_lr_tfidf.best_params_,
    gs_lr_tfidf.best_score_,
    clf.score(x_test, y_test),
):
  print(summary_value)

{'clf__C': 10.0, 'clf__penalty': 'l2'}
0.892484423115377
0.89596


In [None]:
import numpy as np
import re
from nltk.corpus import stopwords

def tokenizer(text):
    """Clean a review and split it into tokens with stop words removed.

    HTML-like tags are stripped, emoticons are collected, the text is
    lowercased with every run of non-word characters collapsed to a space,
    and the emoticons (without the ``-`` nose) are appended at the end.
    The result is split on whitespace and filtered against the module-level
    ``stop`` list.

    Args:
        text: Review text.

    Returns:
        List of tokens not present in ``stop``.
    """
    # Raw strings: '\)' and '\W' are invalid escapes in ordinary literals.
    text = re.sub(r'<[^>]*>', '', text)
    # Search the original-case text, as `preprocessor` does: after lower()
    # the uppercase 'D' / 'P' alternatives of the pattern can never match.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    suffix = ' '.join(emoticons).replace('-', '')
    # Without a separator the first emoticon is glued onto the last word
    # whenever the text ends in a word character.
    if suffix and not cleaned.endswith(' '):
        cleaned += ' '
    tokenized = [w for w in (cleaned + suffix).split() if w not in stop]
    return tokenized

def stream_docs(path):
  """Lazily yield ``(text, label)`` pairs from a review CSV, one per line.

  The first line is treated as a header and skipped. Every following
  non-empty line is split at its last comma: everything before it is
  returned verbatim as the text (including any CSV quoting), and the part
  after it is parsed as the integer label.

  Args:
    path: Path of a UTF-8 CSV whose last column is the integer label.

  Yields:
    ``(text, label)`` tuples in file order.

  Raises:
    ValueError: If the part after the last comma is not an integer.
  """
  with open(path, 'r', encoding='utf-8') as csv_file:
    # A default avoids StopIteration escaping the generator on an empty file.
    next(csv_file, None)
    for line in csv_file:
      row = line.rstrip('\r\n')
      if not row:
        continue
      # Splitting at the last comma instead of slicing fixed offsets also
      # handles a final line that has no trailing newline.
      text, _, label = row.rpartition(',')
      yield text, int(label)

def get_minibatch(doc_stream, size):
  """Pull up to ``size`` documents from ``doc_stream``.

  Args:
    doc_stream: Iterator yielding ``(text, label)`` pairs.
    size: Maximum number of documents to take.

  Returns:
    ``(docs, y)`` lists of texts and labels. If the stream runs out before
    ``size`` documents were read, the documents read so far are returned;
    ``(None, None)`` is returned only when the stream was already exhausted.
  """
  docs, y = [], []
  try:
    for _ in range(size):
      text, label = next(doc_stream)
      docs.append(text)
      y.append(label)
  except StopIteration:
    # Keep a partially filled final batch instead of silently dropping it.
    if not docs:
      return None, None
  return docs, y

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
# Hashing vectorizer with 2**21 feature slots, tokenising with the
# `tokenizer` function defined above.
vect = HashingVectorizer(
    decode_error='ignore',
    n_features=2**21,
    preprocessor=None,
    tokenizer=tokenizer,
)

# NOTE(review): newer scikit-learn releases appear to spell this loss
# 'log_loss' - confirm against the installed version before upgrading.
clf = SGDClassifier(
    loss='log',
    random_state=1,
    max_iter=1,
)
# Single generator over the cleaned CSV, consumed by the cells below.
doc_stream = stream_docs(path='/content/drive/MyDrive/MLIA/Data/movie_data_cleaned.csv')

In [None]:
import pyprind
# One progress tick per mini-batch.
pbar = pyprind.ProgBar(45)

# partial_fit is given the full set of class labels on every call.
classes = np.array([0, 1])
for batch_no in range(45):
  # Take the next 1000 reviews from the shared stream; stop early if
  # get_minibatch returns nothing.
  x_train, y_train = get_minibatch(doc_stream, size=1000)
  if not x_train:
    break
  x_train = vect.transform(x_train)
  clf.partial_fit(x_train, y_train, classes=classes)
  pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:22


In [None]:
# Take the next 5000 reviews from the stream as the evaluation set, hash
# them with the same vectorizer and report the accuracy.
x_test, y_test = get_minibatch(doc_stream, size=5000)
x_test = vect.transform(x_test)
test_accuracy = clf.score(x_test, y_test)
print(test_accuracy)

0.8774


In [None]:
clf = clf.partial_fit(x_test, y_test)

In [None]:
from pickle import dump

# Serialize the trained classifier. The `with` block makes sure the file is
# flushed and closed even if pickling raises.
with open('/content/drive/MyDrive/MLIA/Model/text_model.sav', 'wb') as model_file:
  dump(clf, model_file)

In [None]:
import pickle
import os

# Directory for the serialized objects used by the web app. The folder must
# be 'pkl_objects' because the loading cell later in this notebook reads
# movieclassifier/pkl_objects/classifier.pkl; the earlier 'pll_objects'
# spelling wrote the files where nothing reads them.
dest = os.path.join('/content/drive/MyDrive/MLIA/Model', 'movieclassifier', 'pkl_objects')
os.makedirs(dest, exist_ok=True)

# Context managers close (and flush) each file even if pickling fails.
with open(os.path.join(dest, 'stopwords.pkl'), 'wb') as stop_file:
  pickle.dump(stop, stop_file, protocol=4)
with open(os.path.join(dest, 'classifier.pkl'), 'wb') as clf_file:
  pickle.dump(clf, clf_file, protocol=4)

# 主题建模

In [None]:
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/MLIA/Data/movie_data_cleaned.csv')
df.head()

Unnamed: 0,review,sentiment
0,i am surprised that there is confusion over th...,1
1,had i known to what i was submitting myself i ...,0
2,i didn t enjoy this movie at all for one i jus...,0
3,as i ve noticed with a lot of imdb comments ce...,1
4,this engaging which it shouldn t be low grade ...,0


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# max_df=.1 caps the document frequency of the words considered at 10%, to
# exclude words that occur in too many documents (most likely unrelated to
# any topic).
# max_features=5000 keeps only the 5000 most frequent words, limiting the
# dimensionality of the data set and speeding up LDA inference.
count = CountVectorizer(
    stop_words='english',
    max_df=.1,
    max_features=5000,
)
review_texts = df['review'].values
x = count.fit_transform(review_texts)

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# learning_method='batch' lets the LDA estimator use all available training
# data in each iteration: slower, but the results are more accurate.
# learning_method='online' would be faster at the cost of some accuracy.
lda = LatentDirichletAllocation(
    n_components=10,
    random_state=123,
    learning_method='batch',
)
x_topics = lda.fit_transform(x)

In [None]:
lda.components_.shape

(10, 5000)

In [None]:
n_top_words = 5
# get_feature_names() is deprecated and removed in later scikit-learn
# releases; get_feature_names_out() is its replacement.
feature_names = count.get_feature_names_out()

for topic_idx, topic in enumerate(lda.components_):
  print("Topic %d:" % (topic_idx + 1))
  # argsort() is ascending, so this slice walks the last n_top_words
  # indices backwards: the highest-weighted words for the topic come first.
  print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words -1:-1]]))

Topic 1:
horror original comedy black house
Topic 2:
worst minutes script guy money
Topic 3:
book dvd read version original
Topic 4:
family performance father mother beautiful
Topic 5:
series episode tv comedy kids
Topic 6:
murder police wife john crime
Topic 7:
documentary camera audience effects human
Topic 8:
music song songs musical dance
Topic 9:
effects horror guy budget dead
Topic 10:
action game war fight american




In [None]:
n_top_words = 5
# get_feature_names() is deprecated and removed in later scikit-learn
# releases; get_feature_names_out() is its replacement.
feature_names = count.get_feature_names_out()

for topic_idx, topic in enumerate(lda.components_):
    print("Topic %d:" % (topic_idx + 1))
    # Indices of the n_top_words largest weights, largest first.
    top_indices = topic.argsort()[:-n_top_words - 1:-1]
    print(" ".join([feature_names[i] for i in top_indices]))

Topic 1:
horror original comedy black house
Topic 2:
worst minutes script guy money
Topic 3:
book dvd read version original
Topic 4:
family performance father mother beautiful
Topic 5:
series episode tv comedy kids
Topic 6:
murder police wife john crime
Topic 7:
documentary camera audience effects human
Topic 8:
music song songs musical dance
Topic 9:
effects horror guy budget dead
Topic 10:
action game war fight american


# Internet Application

In [None]:
%cd /content/drive/MyDrive/MLIA/Model/movieclassifier 
!touch vectorizer.py

/content/drive/MyDrive/MLIA/Model/movieclassifier


In [None]:
import os

print(os.getcwd())


/content/drive/MyDrive/MLIA/Model/movieclassifier


In [None]:
os.chdir('/content/drive/MyDrive/MLIA/Model/movieclassifier')
from vectorizer import vect
import pickle
import re
import os

# NOTE: unpickling can execute arbitrary code - only load files you created.
# The `with` block closes the file handle once the classifier is loaded.
with open('/content/drive/MyDrive/MLIA/Model/movieclassifier/pkl_objects/classifier.pkl', 'rb') as clf_file:
  clf = pickle.load(clf_file)

In [None]:
import numpy as np

# Map the classifier's integer classes to readable names.
label = {0:'negative', 1:'positive'}

example = ['I love this movie']
x = vect.transform(example)
# Bare expressions in the middle of a cell are evaluated and thrown away, so
# print the predicted label and its probability explicitly.
print('Prediction: %s\nProbability: %.2f%%' % (label[clf.predict(x)[0]], np.max(clf.predict_proba(x))*100))
clf.predict(x)

array([1])

In [None]:
import sqlite3
import os

# The database is created relative to the current working directory, which
# is expected to be the movieclassifier directory. Start from a clean file
# so that re-running the cell does not fail on the existing table.
if os.path.exists('reviews.sqlite'):
  os.remove('reviews.sqlite')

conn = sqlite3.connect('reviews.sqlite')
try:
  c = conn.cursor()
  c.execute('CREATE TABLE review_db (review TEXT, sentiment INTEGER, date TEXT)')

  # Two seed rows; the timestamp is filled in by SQLite itself.
  example1 = 'i love this movie'
  example2 = 'i disliked this movie'
  c.executemany(
      "INSERT INTO review_db (review, sentiment, date) VALUES(?, ?, DATETIME('now'))",
      [(example1, 1), (example2, 0)])

  conn.commit()
finally:
  # Release the connection even if a statement above raised.
  conn.close()

In [None]:
# Fetch every stored review dated between the fixed start timestamp and now.
date_range_query = "SELECT * FROM review_db WHERE date BETWEEN '2017-01-01 10:10:10' AND DATETIME('now')"

conn = sqlite3.connect('reviews.sqlite')
c = conn.cursor()
c.execute(date_range_query)
result = c.fetchall()
conn.close()

In [None]:
print(result)

[('i love this movie', 1, '2022-11-09 02:31:17'), ('i disliked this movie', 0, '2022-11-09 02:31:17'), ('actually this movie is awesome, i really like this movie', 1, '2022-11-09 02:32:06')]


In [None]:
os.chdir('/content/drive/MyDrive/MLIA/lst_flask_app_1')
os.getcwd()

'/content/drive/MyDrive/MLIA/lst_flask_app_1'

In [None]:
#运行flask
!pip install flask-ngrok
!pip install pyngrok==4.1.1
!ngrok authtoken "2HGRIuzdK7sppxRO4LeZNqn6624_3oBXPeCLB1ZD9Tc9dGXnT"

Authtoken saved to configuration file: /root/.ngrok2/ngrok.yml


In [None]:
!python app.py

 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
 * Running on http://179e-35-201-197-140.ngrok.io
 * Traffic stats available on http://127.0.0.1:4040
127.0.0.1 - - [08/Nov/2022 12:44:18] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [08/Nov/2022 12:44:18] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
^C


## 表单验证与渲染

In [None]:
!pip install wtforms
!pip install flask_ngrok
!pip install pyngrok==4.1.1
!ngrok authtoken "2HGRIuzdK7sppxRO4LeZNqn6624_3oBXPeCLB1ZD9Tc9dGXnT"

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyngrok==4.1.1
  Downloading pyngrok-4.1.1.tar.gz (18 kB)
Building wheels for collected packages: pyngrok
  Building wheel for pyngrok (setup.py) ... [?25l[?25hdone
  Created wheel for pyngrok: filename=pyngrok-4.1.1-py3-none-any.whl size=15983 sha256=f1fcce31ae462ce89356c0b12146ee68ab6f53688ba8976a19d85d8fcc9cba20
  Stored in directory: /root/.cache/pip/wheels/b1/d9/12/045a042fee3127dc40ba6f5df2798aa2df38c414bf533ca765
Successfully built pyngrok
Installing collected packages: pyngrok
Successfully installed pyngrok-4.1.1
Authtoken saved to configuration file: /root/.ngrok2/ngrok.yml


In [None]:
import os
os.chdir('/content/drive/MyDrive/MLIA/lst_flask_app_2')

In [None]:
!python app.py

 * Serving Flask app "app" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off
 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
 * Running on http://f135-34-80-208-57.ngrok.io
 * Traffic stats available on http://127.0.0.1:4040
127.0.0.1 - - [09/Nov/2022 02:36:13] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [09/Nov/2022 02:36:13] "[37mGET /static/style.css HTTP/1.1[0m" 200 -
127.0.0.1 - - [09/Nov/2022 02:36:14] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
127.0.0.1 - - [09/Nov/2022 02:36:54] "[37mPOST /results HTTP/1.1[0m" 200 -
127.0.0.1 - - [09/Nov/2022 02:37:10] "[37mPOST /thanks HTTP/1.1[0m" 200 -
^C


In [None]:
!python --version

Python 3.7.15
