In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

In [5]:
# Location of the raw Online Retail II CSV export.
# NOTE(review): absolute, machine-specific path -- the notebook only runs where this file exists.
retail_csv_path = r'C:\Users\Preeth Shivani\Desktop\online_retail_II.csv'

# Read the whole file into memory and preview it (pandas truncates the display).
retail_df = pd.read_csv(retail_csv_path)
retail_df

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.10,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom
...,...,...,...,...,...,...,...,...
1067366,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France
1067367,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France
1067368,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France
1067369,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,2011-12-09 12:50:00,4.95,12680.0,France


In [11]:
# Inspect the dtype of every column in the dataset.
retail_df.dtypes

Invoice         object
StockCode       object
Description     object
Quantity         int64
InvoiceDate     object
Price          float64
Customer ID    float64
Country         object
dtype: object

In [12]:
# Summarise the frame: index range, per-column non-null counts, dtypes and memory usage.
retail_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1067371 entries, 0 to 1067370
Data columns (total 8 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   Invoice      1067371 non-null  object 
 1   StockCode    1067371 non-null  object 
 2   Description  1062989 non-null  object 
 3   Quantity     1067371 non-null  int64  
 4   InvoiceDate  1067371 non-null  object 
 5   Price        1067371 non-null  float64
 6   Customer ID  824364 non-null   float64
 7   Country      1067371 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 65.1+ MB


In [9]:
# Count the missing (NaN) entries in each column.
retail_df.isna().sum()


Invoice             0
StockCode           0
Description      4382
Quantity            0
InvoiceDate         0
Price               0
Customer ID    243007
Country             0
dtype: int64

In [13]:
# It's evident that 'Description' and 'Customer ID' are the only columns with NULL values.
#
# Assign the filled column back to the frame instead of calling
# fillna(..., inplace=True) on retail_df[...]: the in-place form operates on a
# temporary Series, and under pandas Copy-on-Write it no longer updates
# retail_df, so the nulls would silently remain.

# Fill the NULL values in 'Description' with the placeholder text 'Missing'.
retail_df['Description'] = retail_df['Description'].fillna('Missing')

# Fill the NULL values in 'Customer ID' with the sentinel '00000'.
# NOTE(review): the sentinel is a string in an otherwise float column, so the
# column becomes object dtype with mixed float/str values -- a numeric
# sentinel would keep it numeric; confirm nothing downstream relies on '00000'.
retail_df['Customer ID'] = retail_df['Customer ID'].fillna('00000')

In [14]:
# Re-count missing values per column to check the result of the fill step.
retail_df.isna().sum()

Invoice        0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
Price          0
Customer ID    0
Country        0
dtype: int64

# Modifying Features

In [16]:
# Parse 'InvoiceDate' into pandas datetime values so the .dt accessor and
# date arithmetic can be used on it.
retail_df['InvoiceDate'] = pd.to_datetime(retail_df['InvoiceDate'])
# Display the column dtypes after the conversion.
retail_df.dtypes

Invoice                object
StockCode              object
Description            object
Quantity                int64
InvoiceDate    datetime64[ns]
Price                 float64
Customer ID            object
Country                object
dtype: object

In [17]:
# Split the invoice timestamp into calendar components, one new column each.
invoice_dt = retail_df['InvoiceDate'].dt
retail_df['PurchaseYear'] = invoice_dt.year
retail_df['PurchaseMonth'] = invoice_dt.month
retail_df['PurchaseDay'] = invoice_dt.day

In [12]:
# List the column labels currently present in the frame.
retail_df.columns

Index(['Invoice', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'Price', 'Customer ID', 'Country', 'PurchaseYear', 'PurchaseMonth',
       'PurchaseDay'],
      dtype='object')

In [13]:
# Smallest 'InvoiceDate' value in the dataset.
First_purchase = retail_df['InvoiceDate'].min()
First_purchase

Timestamp('2009-12-01 07:45:00')

In [14]:
# Largest 'InvoiceDate' value in the dataset.
Last_purchase = retail_df['InvoiceDate'].max()
Last_purchase 

Timestamp('2011-12-09 12:50:00')

In [18]:
# Build the boolean target column 'NextPurchase'.
# A row is True when its invoice date plus 90 days does not go past the
# latest invoice date in the dataset.
# NOTE(review): the label depends only on the row's own 'InvoiceDate'; it does
# not check whether the customer actually bought again -- confirm this is the
# intended definition of "next purchase".
latest_invoice = retail_df['InvoiceDate'].max()
retail_df['NextPurchase'] = (retail_df['InvoiceDate'] + pd.DateOffset(days=90)).le(latest_invoice)
retail_df['NextPurchase']

0           True
1           True
2           True
3           True
4           True
           ...  
1067366    False
1067367    False
1067368    False
1067369    False
1067370    False
Name: NextPurchase, Length: 1067371, dtype: bool

In [19]:
# Column names selected from retail_df as the model's input features.
features = ['Customer ID', 'Quantity', 'Price'] 
features

['Customer ID', 'Quantity', 'Price']

In [20]:
# Separate the inputs (X) from the target (y).
X, y = retail_df[features], retail_df['NextPurchase']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [25]:
# Fit a seeded decision tree on the training split (fit() returns the estimator).
model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Evaluate on the held-out split: overall accuracy, then per-class metrics.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

print(classification_report(y_true=y_test, y_pred=y_pred))


# Prediction for new data
# NOTE(review): this re-reads the same CSV that retail_df was loaded from, so
# most of these rows were seen during training -- these are not predictions on
# genuinely new data.
retail_df1 = pd.read_csv(r'C:\Users\Preeth Shivani\Desktop\online_retail_II.csv')

# Apply the same 'Customer ID' null handling that was applied to the training
# frame. The raw file contains missing customer IDs; without this step NaNs
# are passed to model.predict, which either raises or treats those rows
# differently from how they were treated during training.
retail_df1['Customer ID'] = retail_df1['Customer ID'].fillna('00000')

# Score every row with the fitted model, using the same feature columns.
retail_df1['NextPurchasePrediction'] = model.predict(retail_df1[features])

# Display the results
retail_df1[['Customer ID', 'NextPurchasePrediction']]

Accuracy: 0.83
              precision    recall  f1-score   support

       False       0.61      0.43      0.50     41651
        True       0.87      0.93      0.90    171824

    accuracy                           0.83    213475
   macro avg       0.74      0.68      0.70    213475
weighted avg       0.82      0.83      0.82    213475



Unnamed: 0,Customer ID,NextPurchasePrediction
0,13085.0,True
1,13085.0,True
2,13085.0,True
3,13085.0,True
4,13085.0,True
...,...,...
1067366,12680.0,False
1067367,12680.0,False
1067368,12680.0,False
1067369,12680.0,False
