In [21]:
from sqlalchemy import create_engine, inspect
import creds
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn import utils
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
import numpy as np

In [22]:
host = creds.host
password = creds.password
port = creds.port
user = creds.user
database = creds.database
api_type = creds.api_type
engine = create_engine(f'postgresql+{api_type}://{user}:{password}@{host}:{port}/{database}')
# engine.connect()

In [23]:
df = pd.read_sql_table('products', engine)
df.drop(['create_time', 'page_id'], axis=1, inplace=True)
cvec = CountVectorizer()
lab_enc = preprocessing.LabelEncoder()

In [24]:
### Cleaning up the price column in the dataframe.
# Removes all rows which have "N/A" in the price column.
df = df[df['price'] != 'N/A']
# df = df[df['product_name'] != 'N/A']
df['price'] = df['price'].str.strip('£')
df['price'] = df['price'].str.replace(',', '')
df['price'] = df['price'].astype('float64')
encoded = lab_enc.fit_transform(df['price'])

In [25]:
X = df['product_name']
y = encoded
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
print("Number of samples in:")
print(f"    Training: {len(y_train)}")
print(f"    Testing: {len(y_test)}")
# min_df=2 will remove all terms that only appear once in the entire data sheet. I assume them to be outliers or words that are never seen again.
# cvec will transform all words into lowercase by default.
cvec = CountVectorizer(min_df=2).fit(X_train)
df_product_name_train = pd.DataFrame(cvec.transform(X_train).todense(),columns=cvec.get_feature_names_out())
df_product_name_test = pd.DataFrame(cvec.transform(X_test).todense(),columns=cvec.get_feature_names_out())
print(df_product_name_train.shape)
print(y_train.shape)
print(df_product_name_test.shape)
print(y_test.shape)

Number of samples in:
    Training: 4794
    Testing: 2362
(4794, 3731)
(4794,)
(2362, 3731)
(2362,)


In [35]:
lr = LogisticRegression()
lr.fit(df_product_name_train, y_train)
lr.score(df_product_name_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.12574089754445386

In [6]:
X = df['location']
y = encoded
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
print("Number of samples in:")
print(f"    Training: {len(y_train)}")
print(f"    Testing: {len(y_test)}")
cvec = CountVectorizer(stop_words='english').fit(X_train)
df_location_train = pd.DataFrame(cvec.transform(X_train).todense(),columns=cvec.get_feature_names_out())
df_location_test = pd.DataFrame(cvec.transform(X_test).todense(),columns=cvec.get_feature_names_out())
print(df_location_train.shape)
print(y_train.shape)
print(df_location_test.shape)
print(y_test.shape)

Number of samples in:
    Training: 4794
    Testing: 2362
(4794, 1465)
(4794,)
(2362, 1465)
(2362,)


In [37]:
lr = LogisticRegression()
lr.fit(df_location_train, y_train)
lr.score(df_location_test, y_test)

0.08763759525825572

In [19]:
X = df['product_description']
y = encoded
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
print("Number of samples in:")
print(f"    Training: {len(y_train)}")
print(f"    Testing: {len(y_test)}")
# min_df=2 will remove all terms that only appear once in the entire data sheet. I assume them to be outliers or words that are never seen again.
# cvec will transform all words into lowercase by default.
cvec = CountVectorizer(min_df=2).fit(X_train)
df_product_description_train = pd.DataFrame(cvec.transform(X_train).todense(),columns=cvec.get_feature_names_out())
df_product_description_test = pd.DataFrame(cvec.transform(X_test).todense(),columns=cvec.get_feature_names_out())
print(df_product_description_train.shape)
print(y_train.shape)
print(df_product_description_test.shape)
print(y_test.shape)

Number of samples in:
    Training: 4794
    Testing: 2362
(4794, 11871)
(4794,)
(2362, 11871)
(2362,)


In [20]:
df_product_description_train

Unnamed: 0,00,000,003,0049437,005,01,0121,0141,01563,0161,...,yr,yrs,zero,zip,zipped,zips,zombie,zone,zones,zoom
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4789,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4790,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4791,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4792,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [41]:
lr = LogisticRegression()
lr.fit(df_product_description_train, y_train)
lr.score(df_product_description_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.1693480101608806

In [8]:
train = pd.concat([df_product_name_train, df_location_train, df_product_description_train], axis=1)
test = pd.concat([df_product_name_test, df_location_test, df_product_description_test], axis=1)
print(train.shape)
print(y_train.shape)
print(test.shape)
print(y_test.shape)


(4794, 30571)
(4794,)
(2362, 30571)
(2362,)


In [13]:
lr = LogisticRegression()
lr.fit(train, y_train)
# lr.score(test, y_test)
lr.predict(test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([ 36,  68, 104, ...,  36, 115,  99])

In [None]:
# np.random.seed(2)

# models = [
#     DecisionTreeRegressor(splitter="random"),
#     SVR(),
#     LinearRegression()
#         ]

# for model in models:
#     model.fit(X_train, y_train)

#     y_train_pred = model.predict(X_train)
#     y_validation_pred = model.predict(X_validation)
#     y_test_pred = model.predict(X_test)

#     train_loss = mean_squared_error(y_train, y_train_pred)
#     validation_loss = mean_squared_error(y_validation, y_validation_pred)
#     test_loss = mean_squared_error(y_test, y_test_pred)

#     print(
#         f"{model.__class__.__name__}: "
#         f"Train Loss: {train_loss} | Validation Loss: {validation_loss} | "
#         f"Test Loss: {test_loss}"
#         )