In [1]:
import pandas as pd

In [2]:
dataset = pd.read_csv("../../datasets/problem1/merged.csv", engine="pyarrow", usecols=["MAGIC_KEY", "PURCHASE_DATE", "MILK_PURCHAED", "MEAT_PURCHAED"])
dataset.head()

Unnamed: 0,MAGIC_KEY,PURCHASE_DATE,MILK_PURCHAED,MEAT_PURCHAED
0,2940398F26F,2018-10-01,1,1
1,2CA943107B8,2018-10-01,1,1
2,2C00FF1E2DC,2018-10-01,1,1
3,2C1A276B78F,2018-10-01,1,1
4,2BD8A103BA3,2018-10-01,1,1


In [3]:
# Divide the date into day, month and year
dataset["PURCHASE_DATE"] = pd.to_datetime(dataset["PURCHASE_DATE"])
dataset["DAY"] = dataset["PURCHASE_DATE"].dt.day
dataset["MONTH"] = dataset["PURCHASE_DATE"].dt.month
dataset["YEAR"] = dataset["PURCHASE_DATE"].dt.year

dataset

Unnamed: 0,MAGIC_KEY,PURCHASE_DATE,MILK_PURCHAED,MEAT_PURCHAED,DAY,MONTH,YEAR
0,2940398F26F,2018-10-01,1,1,1,10,2018
1,2CA943107B8,2018-10-01,1,1,1,10,2018
2,2C00FF1E2DC,2018-10-01,1,1,1,10,2018
3,2C1A276B78F,2018-10-01,1,1,1,10,2018
4,2BD8A103BA3,2018-10-01,1,1,1,10,2018
...,...,...,...,...,...,...,...
2455795,2C4D5082C67,2019-02-28,0,1,28,2,2019
2455796,2CDF2D93090,2019-02-28,0,1,28,2,2019
2455797,2C1A9F1EB39,2019-02-28,0,1,28,2,2019
2455798,2C5CC71E434,2019-02-28,0,1,28,2,2019


## Vectorization and K-Means Clustering

In [4]:
# Perform tf-idf vectorization on the MAGIC_KEY
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [5]:
NO_OF_CLUSTERS = 4

In [6]:
tf_idf_vectorizer = TfidfVectorizer(analyzer="char")

magic_key_vectorized = tf_idf_vectorizer.fit_transform(dataset["MAGIC_KEY"])

kmeans = KMeans(n_clusters=NO_OF_CLUSTERS, random_state=0).fit(magic_key_vectorized)

dataset["CLUSTER"] = kmeans.labels_

dataset

Unnamed: 0,MAGIC_KEY,PURCHASE_DATE,MILK_PURCHAED,MEAT_PURCHAED,DAY,MONTH,YEAR,CLUSTER
0,2940398F26F,2018-10-01,1,1,1,10,2018,3
1,2CA943107B8,2018-10-01,1,1,1,10,2018,3
2,2C00FF1E2DC,2018-10-01,1,1,1,10,2018,3
3,2C1A276B78F,2018-10-01,1,1,1,10,2018,1
4,2BD8A103BA3,2018-10-01,1,1,1,10,2018,1
...,...,...,...,...,...,...,...,...
2455795,2C4D5082C67,2019-02-28,0,1,28,2,2019,0
2455796,2CDF2D93090,2019-02-28,0,1,28,2,2019,3
2455797,2C1A9F1EB39,2019-02-28,0,1,28,2,2019,3
2455798,2C5CC71E434,2019-02-28,0,1,28,2,2019,3


In [7]:
dataset["CLUSTER"].value_counts()

CLUSTER
3    727016
2    596388
0    568417
1    563979
Name: count, dtype: int64

## Train Test Split

In [8]:
from sklearn.model_selection import train_test_split

### For meat purchased

In [9]:
X_train_meat, X_test_meat, y_train_meat, y_test_meat = train_test_split(
    dataset[["CLUSTER", "MONTH", "YEAR"]],
    dataset["MEAT_PURCHAED"],
    train_size=0.8,
)

### For milk purchased

In [27]:
X_train_milk, X_test_milk, y_train_milk, y_test_milk = train_test_split(
    dataset[["CLUSTER", "MONTH", "YEAR"]],
    dataset["MILK_PURCHAED"],
    train_size=0.8,
)

## Random Forest Regressor

In [11]:
from sklearn.ensemble import RandomForestRegressor

In [12]:
from sklearn.metrics import mean_squared_error, accuracy_score

### For meat purchased

In [14]:
random_forest_regressor_meat_purchased = RandomForestRegressor()

random_forest_regressor_meat_purchased.fit(X_train_meat, y_train_meat)

In [15]:
mean_squared_error(y_test_meat, random_forest_regressor_meat_purchased.predict(X_test_meat))

0.10157439807958851

In [16]:
accuracy_score(y_test_meat, random_forest_regressor_meat_purchased.predict(X_test_meat) > 0.5)


0.8847259548823194

### For milk purchased

In [28]:
random_forest_regressor_milk_purchased = RandomForestRegressor()

random_forest_regressor_milk_purchased.fit(X_train_milk, y_train_milk)

In [29]:
mean_squared_error(y_test_milk, random_forest_regressor_milk_purchased.predict(X_test_milk))

0.19989156813879924

In [30]:
accuracy_score(y_test_milk, random_forest_regressor_milk_purchased.predict(X_test_milk) > 0.5)

0.7120897467220457

In [35]:
submission_df = pd.read_csv("../../datasets/problem1/sample submission 1.csv", engine="pyarrow")

submission_df.head()

Unnamed: 0,MAGIC_KEY,PURCHASE
0,28D5BB06356,Y
1,293BEAB4E98,Y
2,2962EE8065C,Y
3,2957BE29EA9,Y
4,28E351A0745,Y


In [36]:
tf_idf_vectorizer = TfidfVectorizer(analyzer="char")

magic_key_vectorized = tf_idf_vectorizer.fit_transform(submission_df["MAGIC_KEY"])

kmeans = KMeans(n_clusters=NO_OF_CLUSTERS, random_state=0).fit(magic_key_vectorized)

submission_df["CLUSTER"] = kmeans.labels_

submission_df.head()

Unnamed: 0,MAGIC_KEY,PURCHASE,CLUSTER
0,28D5BB06356,Y,1
1,293BEAB4E98,Y,2
2,2962EE8065C,Y,2
3,2957BE29EA9,Y,2
4,28E351A0745,Y,1


In [37]:
# Add month and year to the submission dataset where month is 3 and year is 2019

submission_df["MONTH"] = 3
submission_df["YEAR"] = 2019

submission_df.head()

Unnamed: 0,MAGIC_KEY,PURCHASE,CLUSTER,MONTH,YEAR
0,28D5BB06356,Y,1,3,2019
1,293BEAB4E98,Y,2,3,2019
2,2962EE8065C,Y,2,3,2019
3,2957BE29EA9,Y,2,3,2019
4,28E351A0745,Y,1,3,2019


In [47]:
# Predict the meat and milk purchased

submission_df["MEAT_PURCHAED"] = random_forest_regressor_meat_purchased.predict(submission_df[["CLUSTER", "MONTH", "YEAR"]])
submission_df["MILK_PURCHAED"] = random_forest_regressor_milk_purchased.predict(submission_df[["CLUSTER", "MONTH", "YEAR"]])

In [48]:
submission_df.head()

Unnamed: 0,MAGIC_KEY,PURCHASE,CLUSTER,MONTH,YEAR,MEAT_PURCHAED,MILK_PURCHAED
0,28D5BB06356,Y,1,3,2019,0.925437,0.649351
1,293BEAB4E98,Y,2,3,2019,0.9231,0.672334
2,2962EE8065C,Y,2,3,2019,0.9231,0.672334
3,2957BE29EA9,Y,2,3,2019,0.9231,0.672334
4,28E351A0745,Y,1,3,2019,0.925437,0.649351


In [43]:
# If the value for meat purchased is less than 0.5, then set it to 0 else set it to 1
submission_df.loc[submission_df["MEAT_PURCHAED"] <= 0.5, "MEAT_PURCHAED"] = 0
submission_df.loc[submission_df["MEAT_PURCHAED"] > 0.5, "MEAT_PURCHAED"] = 1

# If the value for milk purchased is less than 0.5, then set it to 0 else set it to 1
submission_df.loc[submission_df["MILK_PURCHAED"] <= 0.5, "MILK_PURCHAED"] = 0
submission_df.loc[submission_df["MILK_PURCHAED"] > 0.5, "MILK_PURCHAED"] = 1

submission_df.head()

Unnamed: 0,MAGIC_KEY,PURCHASE,CLUSTER,MONTH,YEAR,MEAT_PURCHAED,MILK_PURCHAED
0,28D5BB06356,Y,1,3,2019,1.0,1.0
1,293BEAB4E98,Y,2,3,2019,1.0,1.0
2,2962EE8065C,Y,2,3,2019,1.0,1.0
3,2957BE29EA9,Y,2,3,2019,1.0,1.0
4,28E351A0745,Y,1,3,2019,1.0,1.0


In [44]:
# Set the PURCHASE as Y for the submission dataset if either meat or milk is purchased otherwise set it to N
submission_df["PURCHASE"] = "N"
submission_df.loc[(submission_df["MEAT_PURCHAED"] == 1) | (submission_df["MILK_PURCHAED"] == 1), "PURCHASE"] = "Y"

submission_df.head()

Unnamed: 0,MAGIC_KEY,PURCHASE,CLUSTER,MONTH,YEAR,MEAT_PURCHAED,MILK_PURCHAED
0,28D5BB06356,Y,1,3,2019,1.0,1.0
1,293BEAB4E98,Y,2,3,2019,1.0,1.0
2,2962EE8065C,Y,2,3,2019,1.0,1.0
3,2957BE29EA9,Y,2,3,2019,1.0,1.0
4,28E351A0745,Y,1,3,2019,1.0,1.0


In [46]:
submission_df["MEAT_PURCHAED"].value_counts()

MEAT_PURCHAED
1.0    58689
Name: count, dtype: int64

In [45]:
submission_df[["MAGIC_KEY", "PURCHASE"]].to_csv("../../datasets/problem1/export_submission.csv", index=False)

## Hyperparameter Tuning for Random Forest Regressor