In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

In [47]:
df=pd.read_csv("superstore.csv")
tempdf=df

Removing Miscellaneous(Unwanted) columns:

In [48]:
print(tempdf.columns)
tempdf.pop("Unnamed: 0")
tempdf.pop("记录数")
tempdf.pop("Row.ID")
tempdf.pop('Region')

Index(['Unnamed: 0', 'Category', 'City', 'Country', 'Customer.ID',
       'Customer.Name', 'Discount', 'Market', '记录数', 'Order.Date', 'Order.ID',
       'Order.Priority', 'Product.ID', 'Product.Name', 'Profit', 'Quantity',
       'Region', 'Row.ID', 'Sales', 'Segment', 'Ship.Date', 'Ship.Mode',
       'Shipping.Cost', 'State', 'Sub.Category', 'Year', 'Market2', 'weeknum'],
      dtype='object')


0        West
1        West
2        West
3        West
4        West
         ... 
51285    West
51286    West
51287    West
51288    West
51289    West
Name: Region, Length: 51290, dtype: object

Rearraging the columns for convenience

In [49]:
columns = [
    # Order Details
    "Order.ID", "Order.Date", "weeknum", "Year", "Ship.Date", "Order.Priority", "Ship.Mode",    
    # Customer Details
    "Customer.ID", "Customer.Name", "Segment",    
    # Location Details
    "City", "State", "Country",   
    # Product Details
    "Product.ID", "Product.Name", "Category", "Sub.Category",    
    # Sales and Financials
    "Quantity", "Sales", "Discount", "Profit",    
    # Market Details
    "Market", "Market2"
]
tempdf=tempdf[columns]
tempdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51290 entries, 0 to 51289
Data columns (total 23 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Order.ID        50986 non-null  object 
 1   Order.Date      51085 non-null  object 
 2   weeknum         51290 non-null  int64  
 3   Year            51178 non-null  float64
 4   Ship.Date       50917 non-null  object 
 5   Order.Priority  51157 non-null  object 
 6   Ship.Mode       50822 non-null  object 
 7   Customer.ID     51064 non-null  object 
 8   Customer.Name   50864 non-null  object 
 9   Segment         50863 non-null  object 
 10  City            51044 non-null  object 
 11  State           50875 non-null  object 
 12  Country         50883 non-null  object 
 13  Product.ID      51000 non-null  object 
 14  Product.Name    51259 non-null  object 
 15  Category        51171 non-null  object 
 16  Sub.Category    51014 non-null  object 
 17  Quantity        51142 non-null 

Cleaning the dataset:

In [63]:
for i in tempdf.columns:
    if(tempdf[i].dtypes=='object'):
        tempdf[i]=tempdf[i].astype("string")
tempdf['Order.Date']=pd.to_datetime(tempdf['Order.Date'])
tempdf['Ship.Date']=pd.to_datetime(tempdf['Ship.Date'])
tempdf['Year']=pd.to_numeric(tempdf['Year'])

#setting the dataframe index to show the row number
tempdf=tempdf.sort_values(by='Order.Date', ascending=False)
tempdf.reset_index()
tempdf.set_index(np.arange(0,len(tempdf)), inplace=True)

In [51]:
print(len(tempdf))
tempdf = tempdf[tempdf['Ship.Date'] >= tempdf['Order.Date']]
print(df.duplicated().sum())
print(len(tempdf))

51290


0
50715


In [52]:
#filling up the Market and Market2 columns
M1=tempdf["Market"].dropna().unique().tolist()
M2=tempdf["Market2"].dropna().unique().tolist()
M2.append("North America")
#print(M1,"\n", M2)

for i in range(len(M1)): 
    tempdf.loc[tempdf["Market"].isna() & (tempdf["Market2"]==M2[i]), "Market"] = M1[i]
    tempdf.loc[tempdf["Market2"].isna() & (tempdf["Market"]==M1[i]), "Market2"] = M2[i]

Cleaning Location related information

In [53]:
c=pd.read_csv("cities.csv")
city_state_country=c.loc[:,["name", "state_name", "country_name"]]

In [54]:
tempdf.loc[:,['City','State','Country']].nunique

print(len(tempdf[tempdf["Country"].isna()]))
print(len(tempdf[tempdf["State"].isna()]))
print(len(tempdf[tempdf["City"].isna()]))

# Creating a mapping dictionary 
state_to_country = dict(zip(city_state_country['state_name'], city_state_country['country_name']))
city_to_country = dict(zip(city_state_country['name'], city_state_country['country_name']))
city_to_state = dict(zip(city_state_country['state_name'], city_state_country['name']))

# mapping using np.where
tempdf['Country'] = np.where(tempdf['Country'].isna(), tempdf['State'].map(state_to_country), tempdf['Country'])
tempdf['Country'] = np.where(tempdf['Country'].isna(), tempdf['City'].map(city_to_country), tempdf['Country'])

tempdf['State'] = np.where(tempdf['State'].isna(), tempdf['City'].map(city_to_state), tempdf['State'])

print(len(tempdf[tempdf["Country"].isna()]))
print(len(tempdf[tempdf["State"].isna()]))
print(len(tempdf[tempdf["City"].isna()]))

403
412
244
26
348
244


In [55]:
tempdf['Year'] = np.where(tempdf['Year'].isna(), pd.DatetimeIndex(tempdf['Order.Date']).year, tempdf['Year'])

In [None]:
before=len(tempdf)
tempdf=tempdf.dropna()
after=len(tempdf)

print(((before-after)/before)*100)

print(after)
print(tempdf.isna().sum())

0.0
46483
Order.ID          0
Order.Date        0
weeknum           0
Year              0
Ship.Date         0
Order.Priority    0
Ship.Mode         0
Customer.ID       0
Customer.Name     0
Segment           0
City              0
State             0
Country           0
Product.ID        0
Product.Name      0
Category          0
Sub.Category      0
Quantity          0
Sales             0
Discount          0
Profit            0
Market            0
Market2           0
avgdate           0
dtype: int64


In [57]:
freq_table = pd.crosstab(tempdf['Year'], tempdf['Segment'], margins=True, dropna=True, normalize=True) 
round(freq_table*100, 2)
#(freq_table/len(tempdf.dropna()))*100

Segment,Consumer,Corporate,Home Office,All
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2011.0,9.16,5.35,3.04,17.54
2012.0,11.27,6.37,3.72,21.37
2013.0,13.87,8.19,4.87,26.93
2014.0,17.43,10.14,6.59,34.16
All,51.73,30.06,18.22,100.0


In [58]:
#plt.bar(tempdf['Year'], tempdf['Sales'])

In [59]:
avgdate=tempdf['Ship.Date']-tempdf['Order.Date']
tempdf['avgdate']=avgdate