### This notebook aims at analyzing the content of an E-commerce database that lists purchases made by ∼4000 customers over a period of one year (from 2010/12/01 to 2011/12/09).

### This notebook includes :

  * Data Visualization and Analysis
  * Customer Segmentation using RFM Method
  * Analysis of product descriptions for a particular segment of customers (example segments: best customers, loyal customers, lost customers, etc.)
  * Word Cloud for words in Product Description after cleaning (removing unnecessary words, i.e. stop words)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import datetime as dt 
from sklearn.cluster import KMeans

In [None]:
# Load the raw transaction export from the Kaggle input directory.
csv_path = '/kaggle/input/ecommerce-data/data.csv'
data = pd.read_csv(csv_path, encoding='unicode_escape')


In [None]:
data.head()

In [None]:
data.describe()

In [None]:
data.info()

In [None]:
data.isnull().sum()

In [None]:
# Time span covered by the data. InvoiceDate is still raw text at this point, so the
# whole column is parsed before taking min/max: on strings, min()/max() compare
# character by character and do not return the earliest/latest timestamp.
invoice_timestamps = pd.to_datetime(data.InvoiceDate, format='%m/%d/%Y %H:%M')
invoice_timestamps.max() - invoice_timestamps.min()

Check for duplicate rows and remove them.


In [None]:
data.duplicated().sum()

In [None]:
# Remove exact duplicate rows.
data = data.drop_duplicates()

In [None]:
data.duplicated().sum()

Now drop the rows with a missing CustomerID or a missing Description from the dataset.

In [None]:
# Keep only the rows that have both a customer ID and a product description.
has_customer = data['CustomerID'].notnull()
has_description = data['Description'].notnull()
data = data.loc[has_customer & has_description].copy()

In [None]:
data.isnull().sum()

In [None]:
data.info()

In [None]:
data.min()

In [None]:
data.max()

In [None]:
data.nunique()

The main objective now is to remove invalid values: a negative or zero quantity is not possible for a placed order, so those rows have to be removed. Return orders also have to be separated out, because the prediction is for order placements and not for returns.

In [None]:
# Number of distinct Quantity values that are below zero.
data.loc[data['Quantity'] < 0, 'Quantity'].nunique()

The cell above counts the distinct Quantity values that are below 0; there are no rows with a Quantity of exactly 0.

In [None]:
# Keep only rows with a positive quantity. The explicit copy makes `data` an independent
# frame, so the columns added to it in later cells (Sales, date parts) are written to
# the frame itself and do not raise SettingWithCopyWarning.
data = data[data['Quantity'] > 0].copy()

In [None]:
data.min()

The non-positive Quantity values have now been removed; what remains is to check for return orders.

In [None]:
data.Quantity.describe()

In [None]:
data.head()

In [None]:
data.describe()

Sales Column created


In [None]:
# Line-item revenue: units bought times price per unit.
data['Sales'] = data['Quantity'].mul(data['UnitPrice'])
data.head()

In [None]:
# Show any remaining return/cancellation rows, identified by a 'c' prefix on the invoice
# number. The prefix is matched case-insensitively: a lower-case-only test finds nothing
# when the IDs are written with a capital 'C'.
# NOTE(review): the UCI "Online Retail" data this file appears to be derived from uses an
# upper-case 'C' for cancellations - confirm against the raw file.
data[data['InvoiceNo'].astype(str).str.upper().str.startswith('C')]

In [None]:
data['Sales'].describe()

In [None]:
print('Duplicate invoice = ',data['InvoiceNo'].duplicated().sum())
print('Unique invoce = ',data['InvoiceNo'].nunique())

In [None]:
print('Unique Values :- ')
print('Country : ',data['Country'].nunique())
print('Quantity : ',data['Quantity'].nunique())
print('Items : ',data['Description'].nunique())

In [None]:
print("Most Occured :- ")
print('Country = ', data['Country'].mode()[0])
print('Description = ', data['Description'].mode()[0])

In [None]:
# Per-country totals of the numeric columns. numeric_only=True is explicit because
# newer pandas no longer drops text columns silently: without it, sum() tries to
# aggregate InvoiceNo, Description, etc. as well.
data.groupby(['Country']).sum(numeric_only=True).head()

In [None]:
# Convert InvoiceDate from text to datetime using an explicit month/day/year hour:minute
# format, so the `.dt` accessors used for the date-part columns work.
data['InvoiceDate'] = pd.to_datetime(data.InvoiceDate, format='%m/%d/%Y %H:%M')

 New Columns inserted related to Date & Time

In [None]:
# Break the invoice timestamp into calendar parts and insert them, in this order,
# as columns 4 through 10 of the frame.
date_parts = [
    ('Day', data.InvoiceDate.dt.day),
    ('Month', data.InvoiceDate.dt.month),
    ('Year', data.InvoiceDate.dt.year),
    ('WeekDay', data.InvoiceDate.dt.weekday),  # Monday = 0 ... Sunday = 6
    ('Hour', data.InvoiceDate.dt.hour),
    ('Minute', data.InvoiceDate.dt.minute),
    ('Date', data.InvoiceDate.dt.date),
]
for position, (column_name, values) in enumerate(date_parts, start=4):
    data.insert(loc=position, column=column_name, value=values)


### Dataset after wrangling and preprocessing

In [None]:
data.head()

## EDA on the modified dataset

In [None]:
sns.catplot(data=data, x= 'Month', kind = 'count')
plt.title('month vs orders')

In [None]:
sns.catplot(data=data, x= 'Month', y='Sales', kind = 'bar')
plt.title('Month wise Sales ')

In [None]:
sns.catplot(data=data, x= 'WeekDay', y='Sales', kind = 'bar')
plt.title('Sales vs WeekDay ')
# Monday = 0 to Sunday = 6

In [None]:
data['InvoiceNo'].value_counts().head(10)

In [None]:
data['CustomerID'].value_counts().head(10)

In [None]:
data['StockCode'].value_counts().head()

In [None]:
plt.figure(figsize=(15,8))
# Country of every row placed outside the United Kingdom, filtered once and reused
# for both the bars and their ordering.
abroad_countries = data.loc[data['Country'] != 'United Kingdom', 'Country']
# The series is passed as keyword `x`: newer seaborn versions interpret the first
# positional argument as `data`, not as the variable to count.
sns.countplot(x=abroad_countries, order=abroad_countries.value_counts().index)

plt.xticks(rotation=90)
plt.title('Order Count Abroad (Outside UK) ')

In [None]:
# The 15 product descriptions that occur most often, most frequent first.
descrip_count = (
    data['Description']
    .value_counts()
    .sort_values(ascending=False)
    .head(15)
)

In [None]:
# Bar chart of the most frequent product descriptions held in descrip_count.
plt.figure(figsize=(15,8))
sns.barplot(y = descrip_count.values, x=descrip_count.index )
plt.xticks(rotation=90)
# The number in the title is taken from the data, so it always matches the number of
# bars (the fixed "Top 10" label did not match the 15 products being plotted).
plt.title(f'Top {len(descrip_count)} Products ')

In [None]:
sns.catplot(data=data, x = 'Hour', kind = 'count')
plt.title('Order count wrt Hour')



---


# EDA Result:


1.   Max Orders in month (Sep to Dec)
2.   WeekDay wise sales increase until Thursday, then decrease
3.   No Transaction on Saturday
4.   Maximum Sale in UK (as it is UK based company)
5.   Abroad Max Sales in Germany, France, etc. (refer above graph)
6.   Top Products (refer above graph)  
7.   Orders Increases till 12 then decreases 




---






# RFM Model

In [None]:
data['InvoiceDate'].max()

In [None]:
# Reference date for the recency calculation: the calendar day of the most recent
# invoice. It is read from the data instead of being hard-coded, so it stays correct
# if the notebook is run on a different extract. InvoiceDate has already been
# converted to datetime by this point.
now = data['InvoiceDate'].max().date()

In [None]:
# Most recent purchase date for every customer.
new_df = (
    data.groupby('CustomerID', as_index=False)['Date']
    .max()
    .rename(columns={'Date': 'LastPurchaseDate'})
)
new_df.head()

In [None]:
# Recency: whole days between the reference date `now` and the customer's last purchase.
new_df['Recency'] = [(now - last_date).days for last_date in new_df['LastPurchaseDate']]
new_df = new_df.drop(columns='LastPurchaseDate')
new_df.head()

In [None]:
# Frequency: number of rows (non-null InvoiceNo entries) per customer.
# NOTE(review): InvoiceNo values repeat across rows, so this presumably counts purchased
# line items rather than distinct invoices - confirm that this is the intended
# definition of frequency.
new_df2 = data.groupby(by = 'CustomerID', as_index=False)['InvoiceNo'].count()
new_df2.columns = ['CustomerID','Frequency']
new_df2[:4]

In [None]:
# Monetary: total Sales per customer.
new_df3 = (
    data.groupby('CustomerID', as_index=False)['Sales']
    .sum()
    .rename(columns={'Sales': 'Monetary'})
)
new_df3.head(4)

In [None]:
# Combine the recency, frequency and monetary tables into one frame indexed by customer.
temp = pd.merge(new_df, new_df2, on='CustomerID')
rfm_df = pd.merge(temp, new_df3, on='CustomerID').set_index('CustomerID')
rfm_df.head()

In [None]:
# Quartile scores in which '1' is always the best bucket: for Recency the smallest
# values get '1', for Frequency and Monetary the largest values get '1'.
low_is_best = ['1', '2', '3', '4']
high_is_best = ['4', '3', '2', '1']
rfm_df['R_quartile'] = pd.qcut(rfm_df['Recency'], q=4, labels=low_is_best)
rfm_df['F_quartile'] = pd.qcut(rfm_df['Frequency'], q=4, labels=high_is_best)
rfm_df['M_quartile'] = pd.qcut(rfm_df['Monetary'], q=4, labels=high_is_best)
rfm_df.head()

In [None]:
# RFM score: the three quartile labels written side by side, e.g. '1' + '1' + '1' -> '111'.
r_score = rfm_df['R_quartile'].astype(str)
f_score = rfm_df['F_quartile'].astype(str)
m_score = rfm_df['M_quartile'].astype(str)
rfm_df['RFM_Score'] = r_score + f_score + m_score
rfm_df.head()

Score: best = 1, worst = 4 (for each of the three quartile columns)

# Segmentations below using score

Top/Best Customers

In [None]:
# Customers in the best quartile for recency, frequency and monetary value.
rfm_df.loc[rfm_df['RFM_Score'].eq('111')].head()

Loyal Customers

In [None]:
# Customers in the best frequency quartile.
rfm_df.loc[rfm_df['F_quartile'].eq('1')].head()

Customer who spent most

In [None]:
# Customers in the best monetary quartile.
rfm_df.loc[rfm_df['M_quartile'].eq('1')].head()

Lost Customers

In [None]:
# Customers in the worst quartile for recency, frequency and monetary value.
rfm_df.loc[rfm_df['RFM_Score'].eq('444')].head()

In [None]:
# (rows, columns) of the best-customer segment.
rfm_df.loc[rfm_df['RFM_Score'].eq('111')].shape

# Word Cloud for Best Customer

## preparing dataset of best customer for product description analysis

In [None]:
# RFM rows of the best customers (score '111').
temp2 = rfm_df.loc[rfm_df['RFM_Score'].eq('111')]
temp2.head()

In [None]:
# Empty placeholder frame; temp3 is reassigned with the result of the merge further down.
temp3 = pd.DataFrame()

In [None]:
# Turn the CustomerID index into an ordinary column so it can be used as the merge key.
# A new frame is assigned instead of resetting in place, because temp2 is a filtered
# slice of rfm_df and in-place changes on a slice raise SettingWithCopyWarning.
# The guard makes the cell safe to re-run: a second reset would otherwise add a
# spurious 'index' column.
if 'CustomerID' not in temp2.columns:
    temp2 = temp2.reset_index(level=0)
temp2.head()

In [None]:
print(data.shape)
print(temp2.shape)

Right Join on temp2 and data 


In [None]:
# Right join on CustomerID: every row of `data` is kept, and the RFM columns from temp2
# are filled in only for customers present in temp2 (NaN for everyone else).
temp3 =  pd.merge(temp2,data.drop_duplicates(),on='CustomerID',how='right')

In [None]:
temp3.shape

In [None]:
temp3['CustomerID'].nunique()

In [None]:
temp2['CustomerID'].nunique()

In [None]:
data['CustomerID'].nunique()

In [None]:
# Drop every row that contains a missing value; after the right join these are the rows
# of customers outside the best segment, whose RFM columns are NaN.
# The index is re-numbered afterwards: dropna() keeps the original row labels, and the
# resulting gaps make integer lookups such as temp4['Description'][i] fail with KeyError.
temp3 = temp3.dropna().reset_index(drop=True)
temp3['CustomerID'].nunique()

In [None]:
temp3.shape

In [None]:
temp3.head()

In [None]:
# Number of space-separated words in each product description.
description_text = temp3['Description'].astype(str)
temp3['word_count'] = description_text.str.split(" ").str.len()
temp3[['Description','word_count']].head()

temp4 contains description of products bought by best customers, this list of description will be used for wordcloud

In [None]:
# Only the two columns needed for the text analysis.
temp4 = temp3.loc[:, ['Description', 'word_count']]

## temp4 will be used for word cloud and product description analysis

In [None]:
temp4.head()

# Word Count Summary

In [None]:
temp4.word_count.describe()

In [None]:
# The 20 most common words across all descriptions.
all_description_words = ' '.join(temp4['Description']).split()
freq = pd.Series(all_description_words).value_counts().head(20)
freq

In [None]:
# The 20 least common words across all descriptions.
all_description_words = ' '.join(temp4['Description']).split()
freq1 = pd.Series(all_description_words).value_counts().tail(20)
freq1


In [None]:
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer

In [None]:
# English stop words from NLTK plus a few project-specific words (colours and 'OF').
stop_words = set(stopwords.words("english"))
new_words = ['RED','PINK', 'BLUE', 'OF', 'BROWN',"BLACK"]
# The descriptions are lower-cased before stop words are filtered, so upper-case entries
# alone never match anything. The lower-case forms are added as well; the original
# upper-case forms are kept so existing membership checks on new_words still succeed.
stop_words = stop_words.union(new_words, (word.lower() for word in new_words))


In [None]:
for i in new_words:
  if i in stop_words:
    print(i)


In [None]:
# Build the cleaned corpus: one normalised string per product description in temp4.
# The lemmatizer is created once instead of on every row, and the unused stemmer is gone.
lem = WordNetLemmatizer()
corpus = []
# Iterate over the column's values instead of range(0, 164373): the hard-coded row count
# goes stale whenever the data or the segment changes, and integer lookups on
# temp4['Description'] are label-based, so they raise KeyError for any label that an
# earlier dropna() removed.
for description in temp4['Description']:
    # Replace every non-letter (punctuation, digits, markup characters) with a space
    # and lower-case the result. This single substitution also covers the former
    # "remove tags" and "remove special characters and digits" steps, which could no
    # longer match anything once only letters and spaces were left.
    text = re.sub('[^a-zA-Z]', ' ', str(description)).lower()

    # Tokenise on whitespace, drop stop words and lemmatise what remains.
    tokens = [lem.lemmatize(word) for word in text.split() if word not in stop_words]
    corpus.append(" ".join(tokens))

In [None]:
corpus[:10]

In [None]:
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [None]:
# Word cloud over all cleaned descriptions. The documents are joined with spaces before
# being handed to generate(): str(corpus) would pass the list's repr instead, so the
# brackets, commas and quote characters surrounding every document would become part of
# the text being tokenised.
wordcloud = WordCloud(    #background_color='white',
                          stopwords=stop_words,
                          max_words=200,
                          max_font_size=50, 
                          random_state=42
                         ).generate(' '.join(corpus))

In [None]:
# Render the word cloud image without axes.
plt.figure(figsize=(25, 10))
plt.imshow(wordcloud)
plt.axis('off')
plt.title("Word Cloud for Best Customer's Products")

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import re
# Uni-, bi- and tri-gram counts over the cleaned corpus. stop_words is converted to a
# sorted list: CountVectorizer documents a list (or 'english' / None) for this parameter
# and recent scikit-learn versions reject a set during parameter validation.
cv=CountVectorizer(max_df=0.8,stop_words=sorted(stop_words), max_features=10000, ngram_range=(1,3))
X=cv.fit_transform(corpus)

In [None]:
list(cv.vocabulary_.keys())[:20]

In [None]:
# Most frequently occurring words
def get_top_n_words(corpus, n=None):
    """Return the ``n`` most frequent tokens of ``corpus`` as ``(word, count)`` pairs.

    Tokens are extracted with a default ``CountVectorizer``. Counts are summed over
    all documents and the pairs are ordered by count, highest first. With ``n=None``
    every token is returned.
    """
    vectorizer = CountVectorizer()
    doc_term_counts = vectorizer.fit_transform(corpus)
    # 1 x vocabulary-size matrix holding the total count of every token.
    totals = doc_term_counts.sum(axis=0)
    pairs = []
    for token, column in vectorizer.vocabulary_.items():
        pairs.append((token, totals[0, column]))
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs[:n]

# Getting top 20 words in top_df dataframe

In [None]:
# The 20 most frequent words and their counts as a two-column frame.
top_words = get_top_n_words(corpus, n=20)
top_df = pd.DataFrame(top_words, columns=["Word", "Freq"])
top_df.head(20)

In [None]:

sns.catplot(data=top_df,x='Word',y='Freq',kind='bar')
plt.xticks(rotation = 60)


# Thank you for your time 
## Do upvote and comment if you find this notebook helpful 
## Please comment any suggestion