In [None]:
## Reading Files
from google.colab import drive

# Mount Google Drive at /content/drive/ (forcing a remount) so the data files
# read below are reachable from this Colab runtime.
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
import sklearn
import plotly.express as px
from pandas import Period
from scipy import stats
import re

In [None]:
# Load the raw CSV export; low_memory=False makes pandas read the file in one
# pass instead of chunk-wise dtype inference.
ameco_1623 = pd.read_csv('/content/drive/MyDrive/Capstone Data/ameco_data.csv',low_memory=False)
# Work on a copy so the raw frame stays available unchanged.
df = ameco_1623.copy()

In [None]:
# Keep only the columns used by the rest of the notebook.
selected_columns = [
    'X.', 'Customer.No.', 'Customer.Name', 'Customer.Category',
    'Ship.To.State', 'Invoice.Date', 'Strikeforce.Flag', 'Stock.Status',
    'Item.No.', 'Item.Group', 'Product.Classification',
    'UoM.Code', 'Quantity', 'Inventory.Cost', 'Price',
    'DPM.Factor', 'DPM.Price',
]
df = df[selected_columns]

In [None]:
# Show column dtypes and non-null counts for the selected columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1155850 entries, 0 to 1155849
Data columns (total 17 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   X.                      1155850 non-null  object 
 1   Customer.No.            1155850 non-null  object 
 2   Customer.Name           1155835 non-null  object 
 3   Customer.Category       590461 non-null   object 
 4   Ship.To.State           571691 non-null   object 
 5   Invoice.Date            1155850 non-null  object 
 6   Strikeforce.Flag        572301 non-null   object 
 7   Stock.Status            691242 non-null   object 
 8   Item.No.                1155840 non-null  object 
 9   Item.Group              572291 non-null   object 
 10  Product.Classification  572290 non-null   object 
 11  UoM.Code                1155835 non-null  object 
 12  Quantity                1155850 non-null  object 
 13  Inventory.Cost          1155847 non-null  object 
 14  Pr

In [None]:
# Columns that are converted from text to float in the next cell.
numerical_columns = ['Inventory.Cost','Price','Quantity']

In [None]:
# Remove every comma from the text values, then convert the columns to float.
without_commas = df[numerical_columns].replace({',': ''}, regex=True)
df[numerical_columns] = without_commas.astype(float)

In [None]:
# Keep only rows with a strictly positive price (rows with a missing price are
# dropped too, because the comparison is False for NaN).
df = df.loc[df['Price'].gt(0)]

## Dataset Cleaning

### Customer Category Remap

#### Make the Customer Number consistent

In [None]:
# Remove every run of characters that is neither a word character nor
# whitespace (i.e. punctuation) from the customer names.
# regex=True is passed explicitly: the pattern is a regular expression, and
# pandas >= 2.0 treats the pattern as a literal string by default, which would
# silently leave the names unchanged.
df['Customer.Name'] = df['Customer.Name'].str.replace(r'[^\w\s]+', '', regex=True)

  df['Customer.Name'] = df['Customer.Name'].str.replace(r'[^\w\s]+', '')


In [None]:
# Trim leading/trailing whitespace from string names; non-string entries
# (e.g. missing values) are passed through untouched.
df['Customer.Name'] = df['Customer.Name'].apply(
    lambda name: name.strip() if isinstance(name, str) else name
)

In [None]:
# Display the customer names after cleaning.
df['Customer.Name']

0                             BEN HUR
1                 CUSTOM CONVEYOR INC
2                 CUSTOM CONVEYOR INC
3                 CUSTOM CONVEYOR INC
18                       KENNEDY TANK
                      ...            
1155845    ADS MANUFACTURING OHIO LLC
1155846    ADS MANUFACTURING OHIO LLC
1155847    ADS MANUFACTURING OHIO LLC
1155848    ADS MANUFACTURING OHIO LLC
1155849    ADS MANUFACTURING OHIO LLC
Name: Customer.Name, Length: 1134769, dtype: object

In [None]:
def map_customer_no(row):
    """Return the customer number of ``row`` in the 'C' + six-character form.

    When the row's 'X.' value contains 'PRE' and the customer number is one to
    six characters long, the number is left-padded with zeros to six
    characters and prefixed with 'C'. Every other row keeps its customer
    number unchanged.

    Parameters
    ----------
    row : mapping
        Must provide 'Customer.No.'; 'X.' is read only when the customer
        number is one to six characters long.

    Returns
    -------
    str
        The normalized (or untouched) customer number.
    """
    customer_no = row['Customer.No.']
    length = len(customer_no)
    if 1 <= length <= 6 and 'PRE' in row['X.']:
        return 'C' + '0' * (6 - length) + customer_no
    return customer_no

In [None]:
# Normalize the customer number row by row with map_customer_no.
df['Customer.No.'] = df.apply(map_customer_no, axis=1)

In [None]:
# Compare the number of distinct customer numbers with the number of distinct
# customer names after normalization.
print(f"The number of unique customer numberis now {df['Customer.No.'].nunique()}.")
print(f"The number of unique customer name is {df['Customer.Name'].nunique()}.")

The number of unique customer numberis now 2084.
The number of unique customer name is 2123.


In [None]:
# For each customer number, count how many distinct customer names it carries.
customer_pair = (
    df[['Customer.No.', 'Customer.Name']]
    .groupby('Customer.No.')['Customer.Name']
    .nunique()
    .reset_index()
    .rename(columns={'Customer.Name': 'Count'})
)

In [None]:
# Customer numbers whose distinct-name count is not exactly one.
customer_pair[customer_pair['Count']!=1]

Unnamed: 0,Customer.No.,Count
13,C000109,2
45,C000449,2
63,C000612,2
78,C000767,2
109,C001072,2
...,...,...
1681,C109882,2
1683,C109887,2
1692,C109902,2
1694,C109905,2


In [None]:
# Inspect five randomly chosen rows for one of the customer numbers listed
# above. The sample is unseeded, so the rows shown differ between runs.
df[df['Customer.No.']=='C001072'].sample(5)

Unnamed: 0,X.,Customer.No.,Customer.Name,Customer.Category,Ship.To.State,Invoice.Date,Strikeforce.Flag,Stock.Status,Item.No.,Item.Group,Product.Classification,UoM.Code,Quantity,Inventory.Cost,Price,DPM.Factor,DPM.Price
506047,PRE 506048,C001072,APPLIED INDUSTRIAL TECHNOLOGIE,P1,,1/1/2019,,N,ING 212-TK2,,,EA,1.0,34.29,43.59,,35.2
21061,PRE 21062,C001072,APPLIED INDUSTRIAL TECHNOLOGIE,P1,,1/1/2016,,N,ENE M783055,,,EA,1.0,3.62,6.0,,
21060,PRE 21061,C001072,APPLIED INDUSTRIAL TECHNOLOGIE,P1,,1/1/2016,,N,ENE F866028,,,EA,1.0,3.62,6.0,,
91387,PRE 91388,C001072,APPLIED INDUSTRIAL TECHNOLOGIE,P1,,1/1/2016,,N,ENE RSM300K,,,EA,1.0,43.85,80.08,,88.0
223107,PRE 223108,C001072,APPLIED INDUSTRIAL TECHNOLOGIE,P1,,1/1/2017,,N,ENE RCH202K,,,EA,1.0,33.42,106.5,,71.0


#### Read customer category

In [None]:
# Load the first five columns of the customer price-list assignment workbook.
customer_df = pd.read_excel('/content/drive/MyDrive/Capstone Data/20240312 Customer Price List Assignment.xlsx',usecols=[0,1,2,3,4]).copy()

In [None]:
# Peek at ten random rows of the price-list file (unseeded, so the rows vary
# between runs).
customer_df.sample(10)

Unnamed: 0,#,Customer No.,Customer Name,DPM Exempt,Price List Name
3483,3484,C111090,RITCHIE BROTHERS AUCTIONEERS,N,Good
2586,2587,C109977,CHERNE CONSTRUCTION WHSE #157,N,Good
1750,1751,C108912,HERITAGE INDUSTRIAL CONTR.,N,Good
1958,1959,C109240,STEAMFITTERS LOCAL 449 TECH CT,N,Good
840,841,C103797,TAYLOR CRANE & RIGGING INC,N,Good
1190,1191,C106535,WILLIAMS INDUSTRIAL SERV. LLC,N,Good
1335,1336,C107370,EGENOLF INDUSTRIAL GROUP INC.,N,Good
790,791,C103316,E&K EQUIPMENT INC,N,Good
1065,1066,C105641,ALCAR CONSTRUCTORS,N,Good
1264,1265,C107001,IRWIN INDUSTRIES,N,Good


In [None]:
# Number of distinct customer numbers in the price-list file.
customer_df['Customer No.'].nunique()

3595

In [None]:
# Look up the price-list entry for the customer number inspected earlier.
customer_df[customer_df['Customer No.'] == 'C001072']

Unnamed: 0,#,Customer No.,Customer Name,DPM Exempt,Price List Name
128,129,C001072,APPLIED INDUSTRIAL TECHNOLOGIES,N,Good


In [None]:
# Distinct customer numbers present in the price-list file.
customer_df['Customer No.'].unique()

array(['C000002', 'C000014', 'C000021', ..., 'C111271', 'C111272',
       'C180650'], dtype=object)

In [None]:
# Distinct customer numbers in the price-list file and in the invoice data.
no_list_1 = list(customer_df['Customer No.'].unique())
no_list_2 = list(df['Customer.No.'].unique())
print(f'Length of the updated file is {len(no_list_1)}')
print(f'Length of the original file is {len(no_list_2)}')
# Check whether every invoice customer number also appears in the price-list
# file. (`no_list_2 in no_list_1` only asks whether the whole list is a single
# element of no_list_1, which is always False.)
set(no_list_2).issubset(no_list_1)

Length of the updated file is 3595
Length of the original file is 2084


False

In [None]:
# Lookup table: customer number -> customer name from the price-list file
# (a later row wins if a number is repeated).
customer_dict = {
    number: name
    for number, name in zip(customer_df['Customer No.'], customer_df['Customer Name'])
}

In [None]:
def map_customer_name(row):
    """Return the customer name to use for ``row``.

    The name stored in ``customer_dict`` for the row's 'Customer.No.' takes
    precedence; rows whose number is not in the dictionary keep their own
    'Customer.Name'.
    """
    customer_no = row['Customer.No.']
    if customer_no not in customer_dict:
        return row['Customer.Name']
    return customer_dict[customer_no]

In [None]:
# Build the harmonized name column row by row with map_customer_name.
df['customer_name'] = df.apply(map_customer_name, axis=1)

#### Map the customer category

In [None]:
# Lookup table: customer number -> price list name from the price-list file
# (a later row wins if a number is repeated).
category_dict = {
    number: price_list
    for number, price_list in zip(customer_df['Customer No.'], customer_df['Price List Name'])
}

In [None]:
def map_category(row):
    """Return the customer category to use for ``row``.

    The value stored in ``category_dict`` for the row's 'Customer.No.' takes
    precedence; rows whose number is not in the dictionary keep their own
    'Customer.Category'.
    """
    customer_no = row['Customer.No.']
    if customer_no not in category_dict:
        return row['Customer.Category']
    return category_dict[customer_no]

In [None]:
# Build the remapped category column row by row with map_category.
df['customer_category'] = df.apply(map_category, axis=1)

In [None]:
# Row counts per remapped customer category.
df['customer_category'].value_counts()

Good            611198
Better          333080
Best            170152
Average Cost     17723
List Price        1399
M1                 464
P4                 290
P1                 268
P3                  90
P2                  66
M4                  29
P5                  10
Name: customer_category, dtype: int64

Check whether each customer belongs to one category consistently.

> Unfortunately, about 1507 customers have inconsistent customer categories.

In [None]:
# Count the customers that have at least one row where the original
# (non-missing) category differs from the remapped category.
df.loc[
    df['Customer.Category'].ne(df['customer_category']) & df['Customer.Category'].notnull(),
    'Customer.No.',
].nunique()

1507

#### Mark AMECO sub companies

In [None]:
def mark_own_company(row):
    """Return True when the row's 'customer_name' is one of the listed names.

    The comparison is an exact string match against the names below; any
    other value yields False.
    """
    own_company_names = (
        "F&M MAFCO, INC",
        "CHRISTIANSTED EQUIPMENT, LTD.",
        "F&M MAFCO LLC",
        "EQUIPMENTSHARE.COM INC",
        "AMECO IC",
        "AMECO CANADA IC",
        "AMECO IC - VISTRA",
        "AMECO SERVICES INC (TRANSMOUNTAIN)",
    )
    return row['customer_name'] in own_company_names


In [None]:
# Flag each row with mark_own_company.
df['Own'] = df.apply(mark_own_company,axis=1)

In [None]:
# Row counts for flagged vs. unflagged rows.
df['Own'].value_counts()

False    1102450
True       32319
Name: Own, dtype: int64

### Transform Datetime

In [None]:
## date time
# Parse the invoice date, then derive calendar year, month and quarter from it.
df['Invoice.Date'] = pd.to_datetime(df['Invoice.Date'])
invoice_date = df['Invoice.Date'].dt
df['Year'] = invoice_date.year
df['Month'] = invoice_date.month
df['Quarter'] = invoice_date.to_period('Q')

In [None]:
def map_quarter(row):
    """Return the comparison ("previous") quarter for ``row``.

    * Year 2016: the row's own 'Quarter' is returned.
    * Year 2020: always the fixed period 2019Q1.
    * Any other year: the row's 'Quarter' shifted back by four quarters.

    NOTE(review): the fixed 2019Q1 value for 2020 looks deliberate, but the
    reason is not visible in this notebook. Please confirm.
    """
    year = row['Year']
    if year == 2016:
        return row['Quarter']
    if year == 2020:
        return Period('2019Q1', freq='Q-DEC')
    return row['Quarter'] - 4
# Compute the comparison quarter for every row with map_quarter.
df['Prev_Quarter'] = df.apply(map_quarter, axis=1)
# Re-wrap each value as a Period with 'Q-DEC' frequency.
df['Prev_Quarter'] = df['Prev_Quarter'].apply(lambda x: Period(x, freq='Q-DEC'))

### Flag

In [None]:
## Flag mapping
def stock_flag(row):
    """Collapse 'Stock.Status' and 'Strikeforce.Flag' into a single 'Y'/'N' flag.

    'Stock.Status' decides when it is 'S'/'Stock' (-> 'Y') or 'N'/'Non Stock'
    (-> 'N'). For any other status the 'Strikeforce.Flag' is used: 'N' gives
    'N', and every other value (including 'Y' and missing values) gives 'Y'.
    """
    status = row['Stock.Status']
    if status in ('S', 'Stock'):
        return 'Y'
    if status in ('N', 'Non Stock'):
        return 'N'
    # Status is missing or unrecognized: fall back to the Strikeforce flag,
    # defaulting to 'Y' unless it is explicitly 'N'.
    return 'N' if row['Strikeforce.Flag'] == 'N' else 'Y'

# Derive the combined flag for every row with stock_flag.
df['Stock.Flag'] = df.apply(stock_flag, axis=1)

In [None]:
# Drop the Strikeforce flag and the derived Stock.Flag column in place.
# NOTE(review): 'Stock.Flag' was computed in the previous cell and is removed
# here without being used, while later cells still read 'Stock.Status'.
# Confirm whether the intent was to keep 'Stock.Flag' (and drop or overwrite
# 'Stock.Status') instead.
df.drop(columns={'Strikeforce.Flag','Stock.Flag'},inplace=True)

### Item group & Product Classification

In [None]:
## Item group and product classification
def fill_missing_most_frequent(group):
    """Fill missing entries of ``group`` with its most frequent value.

    Parameters
    ----------
    group : pandas.Series
        Values of one group.

    Returns
    -------
    pandas.Series
        ``group`` with missing entries replaced by the first mode, or
        ``group`` itself when no mode exists (e.g. every entry is missing).
    """
    modes = group.mode()
    if modes.empty:
        return group
    # When several values tie, the first entry returned by mode() is used.
    return group.fillna(modes.iloc[0])

# Within each item number, fill missing classification / group values with
# that item's most frequent value.
for column in ('Product.Classification', 'Item.Group'):
    df[column] = df.groupby('Item.No.')[column].transform(fill_missing_most_frequent)

### Final Dataset

In [None]:
# Remove the raw 'X.' column and the original customer name column.
df = df.drop(columns=['X.', 'Customer.Name'])

In [None]:
# Write the cleaned dataset to Drive without the index column.
df.to_csv('/content/drive/MyDrive/Capstone Data/updated_customer_dataset.csv',index=False)

In [None]:
# Display the cleaned dataset.
df

Unnamed: 0,Customer.No.,Customer.Category,Ship.To.State,Invoice.Date,Stock.Status,Item.No.,Item.Group,Product.Classification,UoM.Code,Quantity,...,Price,DPM.Factor,DPM.Price,customer_name,customer_category,Own,Year,Month,Quarter,Prev_Quarter
0,C002309,P3,,2016-01-01,N,KCP 14967,,,EA,6.0,...,47.35,,41.46,BEN HUR CONSTRUCTION CO.,Better,False,2016,1,2016Q1,2016Q1
1,C000263,P3,,2016-01-01,N,KNA 977-2PK,Const Supplies CS/SD,HANBOX,PKG,1.0,...,106.26,,0,CUSTOM CONVEYOR INC,P3,False,2016,1,2016Q1,2016Q1
2,C000263,P3,,2016-01-01,S,WES FA-30,Welding Equip WS/SD,APPFLA,EA,4.0,...,69.95,,0,CUSTOM CONVEYOR INC,P3,False,2016,1,2016Q1,2016Q1
3,C000263,P3,,2016-01-01,N,PRO J07520,Const Supplies CS/SD,HANSOC,EA,3.0,...,15.25,,18.8604,CUSTOM CONVEYOR INC,P3,False,2016,1,2016Q1,2016Q1
18,C001501,P3,,2016-01-01,N,LIN ED010216,Welding Equip WS/SD,FILELM,LBS,150.0,...,2.58,,0,KENNEDY TANK,Good,False,2016,1,2016Q1,2016Q1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1155845,C109148,,OH,2021-06-16,,GEC 713SUCB-M,Const Supplies CS/SD,SAFGLO,PKG12,1.0,...,10.98,0.0,0,ADS MANUFACTURING OHIO LLC,Better,False,2021,6,2021Q2,2020Q2
1155846,C109148,,OH,2021-06-16,,BER QT3-45,Welding Equip WS/SD,MIGPAR,EA,2.0,...,49.32,0.0,0,ADS MANUFACTURING OHIO LLC,Better,False,2021,6,2021Q2,2020Q2
1155847,C109148,,OH,2021-06-16,,LAK CTL412-3X,Const Supplies CS/SD,SAFCOV,EA,25.0,...,3.40,0.0,0,ADS MANUFACTURING OHIO LLC,Better,False,2021,6,2021Q2,2020Q2
1155848,C109148,,OH,2021-06-16,,MIL 216326,Welding Equip WS/SD,APPPRT,EA,10.0,...,2.10,0.0,0,ADS MANUFACTURING OHIO LLC,Better,False,2021,6,2021Q2,2020Q2


### Prepare for Profit Margin Dataset

In [None]:
# List the columns available for the profit-margin dataset.
df.columns

Index(['Customer.No.', 'Customer.Category', 'Ship.To.State', 'Invoice.Date',
       'Stock.Status', 'Item.No.', 'Item.Group', 'Product.Classification',
       'UoM.Code', 'Quantity', 'Inventory.Cost', 'Price', 'DPM.Factor',
       'DPM.Price', 'customer_name', 'customer_category', 'Own', 'Year',
       'Month', 'Quarter', 'Prev_Quarter'],
      dtype='object')

In [None]:
# Select the columns used for the profit-margin dataset.
profit_margin_df = df[['customer_name', 'customer_category', 'Own','Year',
                      'Month', 'Quarter', 'Prev_Quarter','Invoice.Date',
                      'Item.Group', 'Product.Classification',
                      'UoM.Code', 'Quantity', 'Inventory.Cost', 'Price',
                       'Ship.To.State', 'Stock.Status']]

In [None]:
# Keep only invoices dated 2020 through 2023 (both bounds inclusive).
profit_margin_df = profit_margin_df[profit_margin_df['Year'].between(2020, 2023)]

In [None]:
# Shorten the column names for the profit-margin dataset.
profit_margin_df = profit_margin_df.rename(
    columns={
        'Invoice.Date': 'Date',
        'Item.Group': 'Group',
        'Product.Classification': 'Product_Class',
        'UoM.Code': 'Unit',
        'Inventory.Cost': 'Cost',
        'Ship.To.State': 'State',
        'Stock.Status': 'Stock',
    }
)

In [None]:
# Profit margin in percent: 100 * (Price - Cost) / Price.
profit_margin_df['Profit_Margin'] = (
    profit_margin_df['Price']
    .sub(profit_margin_df['Cost'])
    .mul(100)
    .div(profit_margin_df['Price'])
)

In [None]:
# Write the profit-margin dataset to Drive without the index column.
profit_margin_df.to_csv('/content/drive/MyDrive/Capstone Data/pm_dataset.csv',index=False)