In [None]:
# Importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.subplots as sp

import warnings
warnings.filterwarnings("ignore")

# Shared palette: the plotting cells pick one colour per subplot by position.
colours = [
    '#1f77b4', '#fc6c44', '#2b8a2b',
    '#fc7c7c', '#9467bd', '#4ba4ad',
    '#c7ad18', '#7f7f7f', '#69d108',
]
# Font dictionaries reused for subplot titles and axis labels.
axtitle_dict = dict(family='serif', color='#010D36', weight='bold', size=16)
axlab_dict = dict(family='serif', color='black', size=14)

In [None]:
# Load the raw dataset from the Kaggle input directory and display it.
DATA_PATH = '/kaggle/input/customer-behaviour-tourism-portal/Customer behaviour Tourism.csv'
df = pd.read_csv(DATA_PATH)
df

In [None]:
# Report the dimensions of the loaded frame.
n_rows, n_cols = df.shape
print(f"The dataset contains {n_rows} rows and {n_cols} columns")

In [None]:
# Summarise the column dtypes and non-null counts of the freshly loaded data.
df.info()

In [None]:
# List the distinct values of every non-numeric column to spot inconsistent labels.
# Excluding the generic 'number' dtype (rather than only int64/float64) also keeps
# narrower numeric dtypes such as int32/float32 out of this categorical listing.
cat_columns = df.select_dtypes(exclude='number')

for col in cat_columns:
    # The values printed are the unique labels, so the header says so.
    print(f"Unique values for column '{col}':")
    print(df[col].unique())
    print()

In [None]:
# Consolidate the inconsistent `preferred_device` labels: every variant listed
# below is collapsed into the single category 'Mobile'.
mobile_aliases = [
    'iOS and Android', 'iOS', 'ANDROID', 'Android',
    'Android OS', 'Other', 'Others', 'Tab',
]
device_mapping = dict.fromkeys(mobile_aliases, 'Mobile')
df['preferred_device'] = df['preferred_device'].replace(device_mapping)
df['preferred_device'].unique()

In [None]:
# `yearly_avg_Outstation_checkins` contains the placeholder '*': turn it into NaN,
# then convert the column to a numeric dtype (unparseable entries also become NaN).
checkins_col = 'yearly_avg_Outstation_checkins'
checkins_without_placeholder = df[checkins_col].replace('*', np.nan)
df[checkins_col] = pd.to_numeric(checkins_without_placeholder, errors='coerce', downcast='integer')

df[checkins_col].unique()

In [None]:
# `following_company_page` holds a mix of labels: 'Yes', 'No', 'Yeso', NaN, '1' and '0'.
# Collapse every variant into the two labels 'Yes' / 'No'. The string digits '1' and '0'
# have to be mapped too; otherwise they survive as extra categories next to the others.
# String labels (the same convention `working_flag` uses) keep the column object-typed,
# so the dtype-based selections further down keep treating it as a categorical column
# rather than as a numeric feature.
page_mapping = {'Yeso': 'Yes',
                '1': 'Yes',
                '0': 'No'}
df['following_company_page'] = df['following_company_page'].replace(page_mapping)
df['following_company_page'].unique()

In [None]:
# `member_in_family` contains the word "Three": swap it for the number 3,
# then coerce the whole column to a numeric dtype (anything unparseable becomes NaN).
family_size = df['member_in_family'].replace({'Three': 3})
df['member_in_family'] = pd.to_numeric(family_size, errors='coerce', downcast='integer')
df['member_in_family'].unique()

In [None]:
# `working_flag`: relabel the stray string '0' as 'No' so the column uses one label per state.
df['working_flag'] = df['working_flag'].replace({'0': 'No'})
df['working_flag'].unique()

- `following_company_page`, `Adult_flag`, and `travelling_network_rating` should be of categorical type.

In [None]:
# Treat the network rating and the adult flag as categorical (object) columns.
categorical_flags = ["travelling_network_rating", "Adult_flag"]
df[categorical_flags] = df[categorical_flags].astype("object")

In [None]:
# Re-check dtypes and non-null counts after the label fixes and type conversions above.
df.info()

In [None]:
# Sanity check: the numeric features (UserID excluded) are not expected to be negative.
num_columns = df.select_dtypes(exclude=['object']).drop(columns=['UserID'])

# Boolean Series indexed by column name: True where at least one value is below zero.
negative_values = num_columns.lt(0).any()
print("Columns with negative values:")
print(negative_values[negative_values].index)


In [None]:
# Convert any literal 'nan' strings into real NaN values so they are counted as missing.
# `DataFrame.applymap` is deprecated since pandas 2.1 in favour of `DataFrame.map`
# (same element-wise behaviour); use the new name when it exists and fall back to
# `applymap` on older pandas versions.
def _nan_string_to_missing(x):
    """Return NaN for the literal string 'nan', otherwise the value unchanged."""
    return np.nan if x == 'nan' else x

if hasattr(pd.DataFrame, "map"):
    df = df.map(_nan_string_to_missing)
else:
    df = df.applymap(_nan_string_to_missing)

In [None]:
# Overall share of missing cells; ANSI escape codes highlight the figures in the output.
RED, BOLD, RESET = '\033[91m', '\033[1m', '\033[0m'
total_cells = df.size
total_missing = df.isna().sum().sum()
missing_percentage = total_missing / total_cells * 100
print(f"The total number of missing values are {BOLD}{RED}{total_missing}{RESET}, which is {BOLD}{RED}{missing_percentage:.2f}%{RESET} of total data.")


In [None]:
# Count and percentage of missing values, reported only for columns that have any.
missing_counts = df.isna().sum()
missing = missing_counts[missing_counts > 0].index.tolist()
total_rows = len(df)
for column in missing:
    missing_count = missing_counts[column]
    missing_percentage = (missing_count / total_rows) * 100
    print(f"{BOLD}{column}{RESET} has {BOLD}{RED}{missing_count}{RESET} missing values, which is {BOLD}{RED}{missing_percentage:.2f}%{RESET} of the column.")


In [None]:
# Impute the categorical columns with their most frequent value (mode).
# The filled Series is assigned back to the frame: `df[col].fillna(..., inplace=True)`
# mutates an intermediate object and, under pandas Copy-on-Write, never updates `df`.
cat_columns = ['preferred_device', 'preferred_location_type', 'following_company_page', 'working_flag', 'Adult_flag']
for i in cat_columns:
    df[i] = df[i].fillna(df[i].mode()[0])

In [None]:
# Impute the listed numeric columns with their median.
# As with the categorical columns, the result is assigned back instead of relying on
# `inplace=True` on `df[column]`, which does not update `df` under pandas Copy-on-Write.
num_columns = ['Yearly_avg_view_on_travel_page', 'total_likes_on_outstation_checkin_given', 'yearly_avg_Outstation_checkins', 'Yearly_avg_comment_on_travel_page', 'Daily_Avg_mins_spend_on_traveling_page']
for column in num_columns:
    median_value = df[column].median()
    df[column] = df[column].fillna(median_value)

In [None]:
# Check how many missing values remain after the treatment above.
remaining_missing = df.isna().sum().sum()
print('Missing Values in the dataset after treatment :', remaining_missing)

In [None]:
# Count rows that are exact duplicates of an earlier row across all columns.
df.duplicated().sum()

In [None]:
# Re-apply the object (categorical) dtype to the rating and adult-flag columns,
# then re-check the column dtypes and non-null counts.
flag_columns = ["travelling_network_rating", "Adult_flag"]
df[flag_columns] = df[flag_columns].astype("object")
df.info()

In [None]:
# Summary statistics of the numeric columns, one row per column.
df.describe().transpose()

In [None]:
# Summary (count, unique, top, freq) of the object-typed columns, one row per column.
object_columns = df.select_dtypes(include=['object'])
object_columns.describe().transpose()

In [None]:
# Donut chart of the target variable: customers who took vs. did not take the product.
target_labels = {'No': "Not Taken", "Yes": "Taken"}
target_frame = df.assign(ClassMap=df['Taken_product'].map(target_labels))
fig = px.pie(
    target_frame,
    names="ClassMap",
    hole=0.5,
    color_discrete_sequence=["#79a5db", "#e0a580"],
)
fig.update_layout(height=450, width=600, font_color="#28838a", title_font_size=16, showlegend=False)
# Caption placed in the centre of the donut hole.
fig.add_annotation(
    text="Target<br>Overview",
    x=0.5, y=0.5, xref="paper", yref="paper",
    align="center", showarrow=False, font_size=20,
)
fig.update_traces(
    hovertemplate=None,
    textposition="outside",
    texttemplate="%{label}<br>%{value} - %{percent}",
    textfont_size=16,
    rotation=-20,
    marker_line_width=25,
    marker_line_color='#ffffff',
)
fig.show()

In [None]:
# Numeric features to profile (UserID is an identifier, not a feature). Selecting the
# generic 'number' dtype instead of only float64/int64 keeps columns stored with a
# narrower numeric dtype in the analysis.
num_columns = df.select_dtypes(include='number').columns.drop('UserID')

# Distribution of Numeric Columns
plt.rcParams['axes.facecolor'] = 'white'
fig = plt.figure(figsize=[40, 20])
fig.suptitle('DISTRIBUTION OF DATA', fontsize=18, fontweight='bold')
fig.subplots_adjust(top=0.92)
fig.subplots_adjust(hspace=0.5, wspace=0.4)
# Keep the 3x3 layout for up to nine columns, but add rows beyond that so
# `add_subplot` never receives an out-of-range position.
n_grid_cols = 3
n_grid_rows = max(3, -(-len(num_columns) // n_grid_cols))
for i, col in enumerate(num_columns):
    ax = fig.add_subplot(n_grid_rows, n_grid_cols, i + 1)
    # Density-normalised histogram with a KDE overlay. `sns.distplot` is deprecated;
    # this is the `histplot` call seaborn documents as its replacement.
    # The palette is cycled so more columns than colours does not raise an IndexError.
    sns.histplot(
        x=df[col], kde=True, stat="density", kde_kws=dict(cut=3),
        color=colours[i % len(colours)], alpha=.4, edgecolor=(1, 1, 1, .4), ax=ax,
    )
    # Reference lines for the quartiles, mean and median.
    ax.axvline(df[col].quantile(q=0.25), color='green', linestyle='--', label='25% Quartile')
    ax.axvline(df[col].mean(), color='red', linestyle='--', label='Mean')
    ax.axvline(df[col].median(), color='black', linestyle='--', label='Median')
    ax.axvline(df[col].quantile(q=0.75), color='blue', linestyle='--', label='75% Quartile')
    skewness = round(df[col].skew(), 2)
    kurtosis = round(df[col].kurtosis(), 2)
    # Text box to the right of the axes: describe() output without its Name/dtype footer,
    # followed by skewness and kurtosis.
    description_lines = [line for line in str(df[col].describe().round(2)).split('\n') if 'Name' not in line and 'dtype' not in line]
    description_text = '\n'.join(description_lines)
    description_text += f"\nSkewness: {skewness:.2f}\nKurtosis: {kurtosis:.2f}"
    ax.annotate(description_text, xy=(1.01, 0.2), xycoords='axes fraction', fontsize=15)
    ax.set_xlabel(f'{col}', fontdict=axlab_dict)
    ax.set_title(f'{col.upper()}', fontdict=axtitle_dict)
    ax.legend(fontsize=10)
plt.show()

### <b><span style='color:#fa762f'> | </span><span style='color:#28838a'> Distribution of Categorical variables </span></b> 

In [None]:
# Frequency of each travelling-network rating value.
df['travelling_network_rating'].value_counts()

In [None]:
# Drop the rows whose travelling_network_rating equals 10.
# NOTE(review): the reason for excluding rating 10 is not stated here — presumably an
# invalid value; confirm.
# `.copy()` makes the result an independent frame, so the column assignments performed
# in later cells do not act on a slice of the previous frame (the pattern behind
# pandas' SettingWithCopyWarning).
df = df[df['travelling_network_rating'] != 10].copy()

In [None]:
# Object-typed (categorical) columns analysed by `univariateAnalysis_category`.
cat_colums = df.select_dtypes(include='object')
def univariateAnalysis_category(cols):
    """Show the distribution of one categorical column as a bar chart and a donut chart.

    Parameters
    ----------
    cols : str
        Name of a column in the module-level ``cat_colums`` frame.

    Prints a short text header, renders two Plotly figures with ``fig.show()``
    and returns ``None``.
    """
    print("Distribution of", cols)
    print("----------------------------------------------------------------")
    palette = ['#79a5db', '#e0a580', '#6fab90', '#896ca8', '#ADD8E6']
    counts = cat_colums[cols].value_counts()

    # Bar chart of the raw category counts.
    # NOTE(review): the palette is deliberately passed wrapped in a list, exactly as the
    # cell has always done; unwrapping it would change how the bars are coloured.
    bar_fig = px.bar(
        counts,
        x=counts.index,
        y=counts.values,
        title=f'Distribution of {cols}',
        labels={'x': 'Categories', 'y': 'Count'},
        color_discrete_sequence=[palette],
    )
    bar_fig.update_layout(width=700, plot_bgcolor='#ffffff', paper_bgcolor='#ffffff')
    bar_fig.show()

    # Donut chart of the same categories, expressed as percentages of the total.
    shares = (counts / counts.sum()) * 100
    donut_fig = px.pie(
        values=shares,
        names=counts.index,
        labels={'names': 'Categories', 'values': 'Percentage'},
        hole=0.5,
        color_discrete_sequence=palette,
    )
    # Column name written in the middle of the donut hole.
    donut_fig.add_annotation(
        text=f'{cols}',
        x=0.5, y=0.5, xref="paper", yref="paper",
        align="center", showarrow=False, font_size=15,
    )
    donut_fig.update_layout(legend=dict(x=0.9, y=0.5), width=700)
    donut_fig.show()
    print("       ")
# Run the univariate analysis for every categorical column.
for column_name in cat_colums.columns:
    univariateAnalysis_category(column_name)

In [None]:
# Bivariate analysis: KDE of every numeric column, split by whether the product was taken.
fig = plt.figure(figsize=[32, 15])
fig.suptitle('Bivariate Analysis : Distribution of Columns with Product Taken ', fontsize=18, fontweight='bold')
fig.subplots_adjust(top=0.92)
fig.subplots_adjust(hspace=0.5, wspace=0.4)
# Keep the 3x3 layout for up to nine columns, but add rows beyond that so
# `add_subplot` never receives an out-of-range position.
n_grid_cols = 3
n_grid_rows = max(3, -(-len(num_columns) // n_grid_cols))
for i, col in enumerate(num_columns):
    a = fig.add_subplot(n_grid_rows, n_grid_cols, i + 1)
    # `sns.distplot(..., hist=False)` is deprecated; `sns.kdeplot` draws the same KDE curve.
    sns.kdeplot(x=df.loc[df['Taken_product'] == "No", col], color='#142863', ax=a, label='Not Taken')
    sns.kdeplot(x=df.loc[df['Taken_product'] == "Yes", col], color='#f2634e', ax=a, label='Taken')
    a.set_title(col, fontdict=axtitle_dict)
    a.legend(fontsize=15)
plt.show()

In [None]:
# Correlation heatmap (Pearson) of the numeric columns.
corr = df[num_columns].corr(method='pearson')
# `plt.subplots` returns a (figure, axes) pair; unpack it and draw on that axes explicitly
# instead of binding the whole tuple to `fig` and relying on the implicit current axes.
fig, ax = plt.subplots(figsize=(12, 6))
# `cbar=False` and `linewidths` are the documented spellings of the heatmap options.
sns.heatmap(corr, annot=True, fmt='.2f', cbar=False, linewidths=0.9, ax=ax)
# Break the long column names at underscores so the tick labels stay readable.
ax.set_xticklabels([label.get_text().replace('_', '\n') for label in ax.get_xticklabels()], rotation=0, horizontalalignment='center')
ax.set_title(' Correlation Matrix', fontdict=axtitle_dict)
plt.show()

In [None]:
# Box plot of every numeric column to visualise outliers.
plt.rcParams['axes.facecolor'] = 'white'
fig = plt.figure(figsize=[32, 24])
fig.suptitle('BOXPLOT OF ALL COLUMNS', fontsize=18, fontweight='bold')
fig.subplots_adjust(top=0.92, hspace=0.5, wspace=0.4)
for position, col in enumerate(num_columns, start=1):
    # One panel per column on a 6x3 grid, coloured by position in the shared palette.
    ax1 = fig.add_subplot(6, 3, position)
    sns.boxplot(data=df, x=col, color=colours[position - 1], ax=ax1)
    ax1.set_title(f'{col}', fontdict=axtitle_dict)
    ax1.set_xlabel(f'{col}', fontdict=axlab_dict)

In [None]:
# Count, per numeric column, the observations outside the 5th-95th percentile band.
numeric_part = df[num_columns]
Q5 = numeric_part.quantile(0.05)
Q95 = numeric_part.quantile(0.95)
LL, UL = Q5, Q95
above_upper = numeric_part.gt(UL)
below_lower = numeric_part.lt(LL)
outliers = (above_upper | below_lower).sum()
print("Number of Observations Beyond Upper & Lower Limit for Each Column:")
display(outliers)

In [None]:
# Function which returns the lower and upper limit used to detect outliers for a feature
def treat_outlier(col):
    """Return the 5th and 95th percentile of ``col`` as ``(lower, upper)`` limits.

    Parameters
    ----------
    col : array-like of numbers
        Values of one feature (for example a pandas Series or a NumPy array).

    Returns
    -------
    tuple
        ``(q5, q95)``: the 5th and 95th percentile, computed while ignoring NaN entries.
    """
    # `np.percentile` returns NaN for both limits as soon as a single NaN is present,
    # which would make every later comparison False and silently disable the capping.
    # `np.nanpercentile` gives the same result on complete data and skips missing values.
    q5, q95 = np.nanpercentile(col, [5, 95])
    return q5, q95

# Clamp every numeric column to the limits returned by `treat_outlier`:
# values above the upper limit are set to it, values below the lower limit are set to it.
for i in num_columns:
    LR, UR = treat_outlier(df[i])
    capped_high = np.where(df[i] > UR, UR, df[i])
    df[i] = np.where(capped_high < LR, LR, capped_high)

In [None]:
# Box plot of every numeric column again, after the outlier treatment.
plt.rcParams['axes.facecolor'] = 'white'
fig = plt.figure(figsize=[32, 24])
fig.suptitle('BOXPLOT OF ALL COLUMNS POST TREATMENT (SCALED)', fontsize=18, fontweight='bold')
fig.subplots_adjust(top=0.92, hspace=0.5, wspace=0.4)
for position, col in enumerate(num_columns, start=1):
    # One panel per column on a 6x3 grid, coloured by position in the shared palette.
    ax1 = fig.add_subplot(6, 3, position)
    sns.boxplot(data=df, x=col, color=colours[position - 1], ax=ax1)
    ax1.set_title(f'{col}', fontdict=axtitle_dict)
    ax1.set_xlabel(f'{col}', fontdict=axlab_dict)

### Inferences from EDA

    
- **Device Preference:** With 90.58% of users preferring mobile devices, prioritize mobile optimization for all digital content, including advertisements, the website, and applications.

- **Enhancing Engagement:** The majority of users (71.93%) do not follow the company page, indicating a potential gap in user engagement. Develop strategies to increase company page followers by providing valuable and relevant content. Engage users through interactive posts, surveys, and promotions to foster a sense of community and loyalty.

- **Understanding Non-Working Majority:** The dominance of non-working users (84.62%) suggests the need for tailoring marketing strategies. Consider offering flexible travel packages, discounts during off-peak hours, or special promotions for this user segment.

- **Addressing Travelling Network Ratings:** The distribution of ratings, particularly with a significant percentage at '3' (31.23%) and '4' (29.38%), highlights the importance of addressing user feedback. Implement improvements to enhance overall satisfaction and provide a positive user experience.

- **Demographic Tailoring for User Segments:** Considering the diverse distribution in the Adult Flag category, with significant proportions in '0.0' (42.92%) and '1.0' (40.55%), tailoring services and promotions based on user age groups can lead to more effective and targeted marketing efforts.



**The data is cleaned & ready to be used for Modeling post pre-processing**