In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from scipy import stats as s
from sklearn.model_selection import train_test_split
import statistics

In [2]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [3]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [4]:
historical_transactions =pd.read_csv('C:/Users/wei/Documents/碩士/碩二/推薦系統/elo-merchant-category-recommendation/historical_transactions.csv')
merchants = pd.read_csv('C:/Users/wei/Documents/碩士/碩二/推薦系統/elo-merchant-category-recommendation/merchants.csv')
new_transactions = pd.read_csv('C:/Users/wei/Documents/碩士/碩二/推薦系統/elo-merchant-category-recommendation/new_merchant_transactions.csv')
train = pd.read_csv('C:/Users/wei/Documents/碩士/碩二/推薦系統/elo-merchant-category-recommendation/train.csv')
test= pd.read_csv('C:/Users/wei/Documents/碩士/碩二/推薦系統/elo-merchant-category-recommendation/test.csv')

In [5]:
for df in [historical_transactions, new_transactions]:
    df['purchase_date'] = pd.to_datetime(df['purchase_date'])
    df['year'] = df['purchase_date'].dt.year
    df['weekofyear'] = df['purchase_date'].dt.weekofyear
    df['month'] = df['purchase_date'].dt.month
    df['dayofweek'] = df['purchase_date'].dt.dayofweek
    df['weekend'] = (df.purchase_date.dt.weekday >=5).astype(int)
    df['hour'] = df['purchase_date'].dt.hour
    df['authorized_flag'] = df['authorized_flag'].map({'Y':1, 'N':0})
    df['category_1'] = df['category_1'].map({'Y':1, 'N':0}) 
    df['month_diff'] = ((datetime.datetime.today() - df['purchase_date']).dt.days)//30
    df['month_diff'] += df['month_lag']

  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


In [6]:
historical_transactions = reduce_mem_usage(historical_transactions)
new_transactions = reduce_mem_usage(new_transactions)

Mem. usage decreased to 1582.53 Mb (65.2% reduction)
Mem. usage decreased to 102.97 Mb (66.5% reduction)


In [7]:
historical_transactions = pd.get_dummies(historical_transactions, columns=['category_2', 'category_3'])
new_transactions = pd.get_dummies(new_transactions, columns=['category_2', 'category_3'])

In [8]:
def aggregate_transactions(history):
    
    history.loc[:, 'purchase_date'] = pd.DatetimeIndex(history['purchase_date']).\
                                      astype(np.int64) * 1e-9
    
    agg_func = {
    'authorized_flag': ['mean'],
    'category_1': ['sum', 'mean'],
    'category_2_1.0': ['mean'],
    'category_2_2.0': ['mean'],
    'category_2_3.0': ['mean'],
    'category_2_4.0': ['mean'],
    'category_2_5.0': ['mean'],
    'category_3_A': ['mean'],
    'category_3_B': ['mean'],
    'category_3_C': ['mean'],
    'merchant_id': ['nunique'],
    'merchant_category_id': ['nunique'],
    'state_id': ['nunique'],
    'city_id': ['nunique'],
    'subsector_id': ['nunique'],
    'purchase_amount': ['sum', 'mean', 'max', 'min', 'std'],
    'installments': ['sum', 'mean', 'max', 'min', 'std'],
    'purchase_date': [np.ptp, 'min', 'max'],
    'month_lag': ['mean', 'max', 'min', 'std'],
    'month_diff': ['mean'],
    'month': ['nunique'],
    'hour': ['nunique'],
    'weekofyear': ['nunique'],
    'dayofweek': ['nunique'],
    'year': ['nunique'],
    'authorized_flag': ['sum', 'mean'],
    'weekend': ['sum', 'mean']
    }
    
    agg_history = history.groupby(['card_id']).agg(agg_func)
    agg_history.columns = ['_'.join(col).strip() for col in agg_history.columns.values]
    agg_history.reset_index(inplace=True)
    
    df = (history.groupby('card_id')
          .size()
          .reset_index(name='transactions_count'))
    
    agg_history = pd.merge(df, agg_history, on='card_id', how='left')
    
    return agg_history

In [9]:
history = aggregate_transactions(historical_transactions)
history.columns = ['hist_' + c if c != 'card_id' else c for c in history.columns]
history[:5]

Unnamed: 0,card_id,hist_transactions_count,hist_authorized_flag_sum,hist_authorized_flag_mean,hist_category_1_sum,hist_category_1_mean,hist_category_2_1.0_mean,hist_category_2_2.0_mean,hist_category_2_3.0_mean,hist_category_2_4.0_mean,...,hist_month_lag_min,hist_month_lag_std,hist_month_diff_mean,hist_month_nunique,hist_hour_nunique,hist_weekofyear_nunique,hist_dayofweek_nunique,hist_year_nunique,hist_weekend_sum,hist_weekend_mean
0,C_ID_00007093c1,149,114.0,0.765101,28.0,0.187919,0.0,0.0,0.805369,0.0,...,-12,3.453114,47.402685,12,18,39,7,2,25.0,0.167785
1,C_ID_0001238066,123,120.0,0.97561,2.0,0.01626,0.772358,0.0,0.0,0.0,...,-5,1.28898,47.252033,6,20,23,7,2,52.0,0.422764
2,C_ID_0001506ef0,66,62.0,0.939394,0.0,0.0,0.030303,0.0,0.969697,0.0,...,-13,4.2375,47.378788,11,15,24,7,2,32.0,0.484848
3,C_ID_0001793786,216,189.0,0.875,2.0,0.009259,0.050926,0.351852,0.069444,0.0,...,-9,2.306373,51.259259,10,21,33,7,1,37.0,0.171296
4,C_ID_000183fdda,144,137.0,0.951389,4.0,0.027778,0.048611,0.006944,0.909722,0.0,...,-6,1.895264,47.326389,7,19,27,7,2,33.0,0.229167


In [10]:
new = aggregate_transactions(new_transactions)
new.columns = ['new_' + c if c != 'card_id' else c for c in new.columns]
new[:5]

Unnamed: 0,card_id,new_transactions_count,new_authorized_flag_sum,new_authorized_flag_mean,new_category_1_sum,new_category_1_mean,new_category_2_1.0_mean,new_category_2_2.0_mean,new_category_2_3.0_mean,new_category_2_4.0_mean,...,new_month_lag_min,new_month_lag_std,new_month_diff_mean,new_month_nunique,new_hour_nunique,new_weekofyear_nunique,new_dayofweek_nunique,new_year_nunique,new_weekend_sum,new_weekend_mean
0,C_ID_00007093c1,2,2,1,0,0.0,0.5,0.0,0.5,0.0,...,2,0.0,47.5,1,2,2,2,1,0,0.0
1,C_ID_0001238066,26,26,1,2,0.076923,0.769231,0.0,0.0,0.0,...,1,0.485165,47.153846,2,16,9,6,1,12,0.461538
2,C_ID_0001506ef0,2,2,1,0,0.0,0.0,0.0,1.0,0.0,...,1,0.0,47.0,1,2,2,2,1,0,0.0
3,C_ID_0001793786,31,31,1,0,0.0,0.483871,0.258065,0.16129,0.0,...,1,0.475191,51.0,2,10,6,6,1,14,0.451613
4,C_ID_000183fdda,11,11,1,0,0.0,0.0,0.0,1.0,0.0,...,1,0.467099,47.545455,2,8,7,6,1,2,0.181818


In [11]:
train = pd.merge(train, history, on='card_id', how='left')
test = pd.merge(test, history, on='card_id', how='left')

train = pd.merge(train, new, on='card_id', how='left')
test = pd.merge(test, new, on='card_id', how='left')


In [12]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 201917 entries, 0 to 201916
Data columns (total 92 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   first_active_month                 201917 non-null  object 
 1   card_id                            201917 non-null  object 
 2   feature_1                          201917 non-null  int64  
 3   feature_2                          201917 non-null  int64  
 4   feature_3                          201917 non-null  int64  
 5   target                             201917 non-null  float64
 6   hist_transactions_count            201917 non-null  int64  
 7   hist_authorized_flag_sum           201917 non-null  float64
 8   hist_authorized_flag_mean          201917 non-null  float64
 9   hist_category_1_sum                201917 non-null  float64
 10  hist_category_1_mean               201917 non-null  float64
 11  hist_category_2_1.0_mean           2019

In [13]:
from sklearn.impute import SimpleImputer
my_imputer = SimpleImputer()

In [14]:
feature_cols = [col for col in train.columns if col not in ['target', 'first_active_month', 'card_id']]
X = train[feature_cols]

# impute missing values
X = my_imputer.fit_transform(X)

# get the target vector
y = train['target']

In [35]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=6)

In [36]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

reg_predictions = []

In [37]:
myKNeighborsReg = KNeighborsRegressor(n_neighbors = 3)

myKNeighborsReg.fit(X_train, y_train)

y_predict_myKNeighborsReg = myKNeighborsReg.predict(X_test)

reg_predictions.append(y_predict_myKNeighborsReg)

# TODO: find and change a time stamp feature to a float

In [38]:
myDecisionTreeReg = DecisionTreeRegressor(random_state = 5)

myDecisionTreeReg.fit(X_train, y_train)

y_predict_myDecisionTreeReg = myDecisionTreeReg.predict(X_test)

reg_predictions.append(y_predict_myDecisionTreeReg)

In [39]:
myLinearReg = LinearRegression()

myLinearReg.fit(X_train, y_train)

y_predict_myLinearReg = myLinearReg.predict(X_test)

reg_predictions.append(y_predict_myLinearReg)

In [40]:
myRandomForestReg = RandomForestRegressor(n_estimators = 9, bootstrap = True, random_state = 3)

myRandomForestReg.fit(X_train, y_train)

y_predict_myRandomForestReg = myRandomForestReg.predict(X_test)

reg_predictions.append(y_predict_myRandomForestReg)

print(X.shape)

(201917, 89)


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dtype=np.int)


In [41]:
from sklearn import metrics

for model, y_prediction in zip(['K Nearest Neighbor: ', 'Decision Tree: ', 'Linear Regression: ', 'Random Forest: '], reg_predictions):
    mse = metrics.mean_squared_error(y_test, y_prediction)
    rmse = np.sqrt(mse)
    print(model + str(rmse))

K Nearest Neighbor: 4.281660217556923
Decision Tree: 5.38598617486404
Linear Regression: 3.810687009242546
Random Forest: 3.885427623739086


In [42]:
for model, y_prediction in zip(['K Nearest Neighbor: ', 'Decision Tree: ', 'Linear Regression: ', 'Random Forest: '], reg_predictions):
    mse = metrics.mean_squared_error(y_test, y_prediction)
    print(model + str(mse))

K Nearest Neighbor: 18.3326142186096
Decision Tree: 29.008847075826566
Linear Regression: 14.5213354824099
Random Forest: 15.096547819314761


In [43]:
print('訓練集: ',myKNeighborsReg.score(X_train,y_train))
print('測試集: ',myKNeighborsReg.score(X_test,y_test))

訓練集:  0.3622714997387203
測試集:  -0.27345692924967824


In [44]:
print('R2 score: ', myKNeighborsReg.score(X_train,y_train))
mse = metrics.mean_squared_error(y_test, y_prediction)
print('MSE score: ', mse)

R2 score:  0.3622714997387203
MSE score:  15.096547819314761
RMSE score:  3.885427623739086


## PCA

In [26]:
from sklearn.decomposition import PCA
n = 45 # (n is the number of components (new features)
# after dimensionality reduction)
my_pca = PCA(n_components = n)
# (X_Train is feature matrix of training set before DR,
# X_Train_New is feature matrix of training set after DR):
X_Train_new = my_pca.fit_transform(X_train)
X_Test_new = my_pca.transform(X_test)

In [27]:
reg_predictions_new = []

In [28]:
myKNeighborsReg = KNeighborsRegressor(n_neighbors = 3)

myKNeighborsReg.fit(X_Train_new, y_train)

y_predict_myKNeighborsReg = myKNeighborsReg.predict(X_Test_new)

reg_predictions_new.append(y_predict_myKNeighborsReg)

In [29]:
print('R2 score: ', myKNeighborsReg.score(X_Train_new,y_train))
mse = metrics.mean_squared_error(y_test, y_prediction)
print('MSE score: ', mse)

R2 score:  0.36328065679622323
MSE score:  15.697200030809258


In [30]:
myDecisionTreeReg = DecisionTreeRegressor(random_state = 5)

myDecisionTreeReg.fit(X_Train_new, y_train)

y_predict_myDecisionTreeReg = myDecisionTreeReg.predict(X_Test_new)

reg_predictions_new.append(y_predict_myDecisionTreeReg)

In [31]:
myLinearReg = LinearRegression()

myLinearReg.fit(X_Train_new, y_train)

y_predict_myLinearReg = myLinearReg.predict(X_Test_new)

reg_predictions_new.append(y_predict_myLinearReg)

In [32]:
myRandomForestReg = RandomForestRegressor(n_estimators = 9, bootstrap = True, random_state = 3)

myRandomForestReg.fit(X_Train_new, y_train)

y_predict_myRandomForestReg = myRandomForestReg.predict(X_Test_new)

reg_predictions_new.append(y_predict_myRandomForestReg)

print(X.shape)

(201917, 89)


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dtype=np.int)


In [33]:
for model, y_prediction in zip(['K Nearest Neighbor: ', 'Decision Tree: ', 'Linear Regression: ', 'Random Forest: '], reg_predictions_new):
    mse = metrics.mean_squared_error(y_test, y_prediction)
    rmse = np.sqrt(mse)
    print(model + str(rmse))

K Nearest Neighbor: 4.307557087326024
Decision Tree: 5.537317671446234
Linear Regression: 3.7755488794473293
Random Forest: 3.997754230622962


In [34]:
for model, y_prediction in zip(['K Nearest Neighbor: ', 'Decision Tree: ', 'Linear Regression: ', 'Random Forest: '], reg_predictions):
    mse = metrics.mean_squared_error(y_test, y_prediction)
    print(model + str(mse))

K Nearest Neighbor: 18.555048060572656
Decision Tree: 27.1802454740287
Linear Regression: 14.052478803084796
Random Forest: 15.697200030809258
