## Obtaining Product Reviews

In [1]:
import pandas as pd
df = pd.read_csv('reviews.csv')

In [2]:
print('Number of reviews: ', df.count())

Number of reviews:  id             445
profileName    445
text           445
date           445
title          445
rating         445
images          42
dtype: int64


In [3]:
df.head(3)

Unnamed: 0,id,profileName,text,date,title,rating,images
0,R13Z1BSD70DMKJ,Dan,\n Have had this for one day and the cover is...,"Reviewed in the United States on July 4, 2019",Great inner content! Not that great outer qual...,4,
1,R6WKS7YWOKMBL,kdfuser,\n Just finished the book and followed the co...,"Reviewed in the United States on August 15, 2019",Very enjoyable read,5,https://images-na.ssl-images-amazon.com/images...
2,R2MCPJMRB3G23,Marcel Dupasquier,\n For all who want to compare the 1st to the...,"Reviewed in the United States on May 23, 2019",The updated preface,5,https://images-na.ssl-images-amazon.com/images...


## Cleansing the data

In [11]:
# Used a different translator that was able to work
from googletrans import Translator

In [12]:
detector = Translator()
detection = detector.detect('Good')
print(detection)

Detected(lang=en, confidence=None)


In [8]:
#pip install git+https://github.com/BoseCorp/py-googletrans.git --upgrade

## Removing Non-English Reviews

In [18]:
# Added a timeout value due to the original run having a connection timeout
# code is a little different fro the book
translator = Translator(timeout=10) 
detector = Translator()
df['lang'] = df['title'].apply(lambda x: detector.detect(x).lang)

In [19]:
print(df[['title','rating','lang']])

                                                 title  rating   lang
0    Great inner content! Not that great outer qual...       4     en
1                                  Very enjoyable read       5     en
2                                  The updated preface       5     en
3    Good for beginner but does not go too far or deep       4     en
4                                   Worth Every Penny!       5     en
..                                                 ...     ...    ...
440                                            Not bad       1     en
441                                               Good       5     en
442                                              Super       5     en
443                                      内容はとても良い、作りは×       4     ja
444                                               非常实用       5  zh-CN

[445 rows x 3 columns]


In [21]:
df = df[df["lang"]=='en']

In [22]:
df.head(5)

Unnamed: 0,id,profileName,text,date,title,rating,images,lang
0,R13Z1BSD70DMKJ,Dan,\n Have had this for one day and the cover is...,"Reviewed in the United States on July 4, 2019",Great inner content! Not that great outer qual...,4,,en
1,R6WKS7YWOKMBL,kdfuser,\n Just finished the book and followed the co...,"Reviewed in the United States on August 15, 2019",Very enjoyable read,5,https://images-na.ssl-images-amazon.com/images...,en
2,R2MCPJMRB3G23,Marcel Dupasquier,\n For all who want to compare the 1st to the...,"Reviewed in the United States on May 23, 2019",The updated preface,5,https://images-na.ssl-images-amazon.com/images...,en
3,R17D6VX0MH3SZY,Dear MR.J,\n What is good about it? It teaches you the ...,"Reviewed in the United States on June 29, 2019",Good for beginner but does not go too far or deep,4,,en
4,R20RZ5QNLXDI9T,CxegCfiJjXRfuN9,\n Let me preface this review by saying that ...,"Reviewed in the United States on February 9, 2020",Worth Every Penny!,5,,en


## Splitting and transforming the data

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [26]:
reviews = df['title'].values
ratings = df['rating'].values
reviews_train, reviews_test, y_train, y_test = train_test_split(reviews,
                ratings, test_size=0.2, random_state=1000)
vectorizer=CountVectorizer()
vectorizer.fit(reviews_train)
x_train = vectorizer.transform(reviews_train)
x_test = vectorizer.transform(reviews_test)

In [28]:
print(len(x_train.toarray()))

324


In [29]:
print(len(x_test.toarray()))

81


In [30]:
print(len(x_train.toarray()[0]))

441


In [31]:
print(x_train.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


## Training the model

In [32]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(x_train, y_train)

## Evaluating the model

In [39]:
import numpy as np
predicted = classifier.predict(x_test)
accuracy1 = classifier.score(x_test, y_test)

In [40]:
print(accuracy1)

0.654320987654321


In [42]:
# Could have used the score value from the model
accuracy = np.mean(predicted == y_test)
print(accuracy)

0.654320987654321


In [43]:
from sklearn import metrics
print(metrics.confusion_matrix(y_test, predicted, labels = [1,2,3,4,5]))

[[ 1  0  0  0  5]
 [ 1  0  1  1  1]
 [ 1  0  0  1  6]
 [ 0  0  1  0  7]
 [ 0  0  0  3 52]]


In [44]:
print(df.groupby("rating").size())

rating
1     26
2     15
3     23
4     51
5    290
dtype: int64


In [46]:
print(metrics.classification_report(y_test, predicted, labels=[1,2,3,4,5]))

              precision    recall  f1-score   support

           1       0.33      0.17      0.22         6
           2       0.00      0.00      0.00         4
           3       0.00      0.00      0.00         8
           4       0.00      0.00      0.00         8
           5       0.73      0.95      0.83        55

    accuracy                           0.65        81
   macro avg       0.21      0.22      0.21        81
weighted avg       0.52      0.65      0.58        81



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Predicting Stock trends

In [68]:
import yfinance as yf
tkr = yf.Ticker("AAPL")
hist = tkr.history(period='1y')


In [69]:
# if the date has minutes and other pieces, run this to get to a basic date
hist.index = hist.index.tz_localize(None)

In [70]:
import pandas_datareader.data as pdr
from datetime import date, timedelta

In [71]:
end = date.today()
start = end - timedelta(days=365)
index_data = pdr.get_data_stooq('^SPX', start, end)

In [72]:
hist.head(3)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2024-02-08,188.466692,188.615955,186.436644,187.401917,40962000,0.0,0.0
2024-02-09,187.969853,189.305033,187.322202,188.169144,45155200,0.24,0.0
2024-02-12,187.740691,187.98979,186.116563,186.475266,41781900,0.0,0.0


In [73]:
index_data.head(3)

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-02-07,6083.13,6101.28,6019.96,6025.99,2936193348
2025-02-06,6072.22,6084.03,6046.83,6083.57,3123712582
2025-02-05,6020.45,6062.86,6007.06,6061.48,3180982517


In [74]:
df = hist.join(index_data, rsuffix='_idx')

In [75]:
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Open_idx,High_idx,Low_idx,Close_idx,Volume_idx
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2024-02-08,188.466692,188.615955,186.436644,187.401917,40962000,0.0,0.0,4995.16,5000.4,4987.09,4997.91,2439217345
2024-02-09,187.969853,189.305033,187.322202,188.169144,45155200,0.24,0.0,5004.17,5030.06,5000.34,5026.61,2314039033
2024-02-12,187.740691,187.98979,186.116563,186.475266,41781900,0.0,0.0,5026.83,5048.39,5016.83,5021.84,2243585026
2024-02-13,185.100243,185.538659,182.848381,184.372864,56529500,0.0,0.0,4967.94,4971.3,4920.31,4953.17,2596243811
2024-02-14,184.651864,184.861098,181.782242,183.486069,54630500,0.0,0.0,4976.44,5002.52,4956.45,5000.62,2324738755


In [76]:
df = df[['Close','Volume','Close_idx','Volume_idx']]

In [77]:
df.head(3)

Unnamed: 0_level_0,Close,Volume,Close_idx,Volume_idx
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2024-02-08,187.401917,40962000,4997.91,2439217345
2024-02-09,188.169144,45155200,5026.61,2314039033
2024-02-12,186.475266,41781900,5021.84,2243585026


## Deriving Features from Continuous Data

In [78]:
df['priceRise'] = np.log(df['Close'] / df['Close'].shift(1))
df['volumeRise'] = np.log(df['Volume'] / df['Volume'].shift(1))
df['priceRise_idx'] = np.log(df['Close_idx'] / df['Close_idx'].shift(1))
df['volumeRise_idx'] = np.log(df['Volume_idx'] / df['Volume_idx'].shift(1))
df = df.dropna()

In [79]:
df = df[['priceRise','volumeRise','priceRise_idx','volumeRise_idx']]

In [80]:
df.head(3)

Unnamed: 0_level_0,priceRise,volumeRise,priceRise_idx,volumeRise_idx
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2024-02-09,0.004086,0.097461,0.005726,-0.052683
2024-02-12,-0.009043,-0.077642,-0.000949,-0.030919
2024-02-13,-0.011338,0.302299,-0.013769,0.145991


## Generating the output variable

In [81]:
conditions = [
(df['priceRise'].shift(-1) > 0.01),
(df['priceRise'].shift(-1)< -0.01)
]
choices = [1,-1]
df['Pred'] = np.select(conditions, choices, default=0)

In [82]:
df.head(3)

Unnamed: 0_level_0,priceRise,volumeRise,priceRise_idx,volumeRise_idx,Pred
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-02-09,0.004086,0.097461,0.005726,-0.052683,0
2024-02-12,-0.009043,-0.077642,-0.000949,-0.030919,-1
2024-02-13,-0.011338,0.302299,-0.013769,0.145991,0


## Training and evaluating the model

In [83]:
features = df[['priceRise','volumeRise','priceRise_idx','volumeRise_idx']].to_numpy()
features = np.around(features, decimals=2)
target = df['Pred'].to_numpy()

In [84]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

rows_train, rows_test, y_train, y_test = train_test_split(features, target, test_size=0.2)
clf = LogisticRegression()
clf.fit(rows_train, y_train)

In [85]:
print(clf.score(rows_test, y_test))

0.66
