In [15]:
import pandas as pd
import numpy as np

#Statistic
from scipy import stats

#Visualization
import plotly.graph_objects as go
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

#Preprocessing
from sklearn.preprocessing import OneHotEncoder, StandardScaler

#Machine Learning
from sklearn.cluster import KMeans

# Dataframe

In [2]:
# Location of the tips dataset. This is an absolute, machine-specific path:
# edit this single constant when running the notebook somewhere else.
TIPS_CSV_PATH = r"C:\Users\Saravanan\OneDrive\Desktop\Datasets_data\Tips.csv"

# Load the raw data into the frame used by the rest of the notebook.
df = pd.read_csv(TIPS_CSV_PATH)

In [3]:
# Preview the loaded frame (the rendered output is truncated to the first and last rows).
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


# Observation

# Unsupervised

#Continuous - total_bill, tip
#Category - sex, smoker, day, time, size


In [46]:
# Numeric measurements analysed as continuous variables.
continuous_columns = [
    "total_bill",
    "tip",
]

# Variables analysed as categories (note that the integer party size is treated as a category).
category_columns = [
    "sex",
    "smoker",
    "day",
    "time",
    "size",
]

# Data Cleaning

In [16]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   total_bill   244 non-null    float64
 1   tip          244 non-null    float64
 2   size         244 non-null    int64  
 3   sex_Female   244 non-null    bool   
 4   sex_Male     244 non-null    bool   
 5   smoker_No    244 non-null    bool   
 6   smoker_Yes   244 non-null    bool   
 7   day_Fri      244 non-null    bool   
 8   day_Sat      244 non-null    bool   
 9   day_Sun      244 non-null    bool   
 10  day_Thur     244 non-null    bool   
 11  time_Dinner  244 non-null    bool   
 12  time_Lunch   244 non-null    bool   
dtypes: bool(10), float64(2), int64(1)
memory usage: 8.2 KB


In [17]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

total_bill     0
tip            0
size           0
sex_Female     0
sex_Male       0
smoker_No      0
smoker_Yes     0
day_Fri        0
day_Sat        0
day_Sun        0
day_Thur       0
time_Dinner    0
time_Lunch     0
dtype: int64

# Statistical Analysis

Continuous - Central Limit Theorem, 1-Tailed Test (one-sample t-test)

Continuous vs. Continuous - Correlation, 2-Tailed Test (two-sample t-test)

Continuous vs. Categorical - ANOVA (Analysis of Variance)

Categorical vs. Categorical - Chi-Square Test

In [10]:
# Columns stored as strings (object dtype) have to be encoded before correlating.
non_numeric_columns = df.select_dtypes(include=['object']).columns

# One-hot encode all of them in a single pass; every category value becomes
# its own indicator column and the original string column is dropped.
# NOTE(review): this rebinds `df`, so the raw sex/smoker/day/time columns no
# longer exist afterwards. Anything that still needs them must run before this
# cell or work on a copy of the raw frame.
df = pd.get_dummies(df, columns=list(non_numeric_columns))

# Correlation matrix over the now fully numeric frame.
correlation_matrix = df.corr()


In [11]:
# Display the pairwise correlations of every column in the encoded frame
# (the same computation that the previous cell stored in correlation_matrix).
df.corr()

Unnamed: 0,total_bill,tip,size,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
total_bill,1.0,0.675734,0.598315,-0.144877,0.144877,-0.085721,0.085721,-0.086168,0.054919,0.122953,-0.138174,0.183118,-0.183118
tip,0.675734,1.0,0.489299,-0.088862,0.088862,-0.005929,0.005929,-0.055463,-0.00279,0.125114,-0.095879,0.121629,-0.121629
size,0.598315,0.489299,1.0,-0.086195,0.086195,0.133178,-0.133178,-0.142184,-0.041121,0.193054,-0.072598,0.103411,-0.103411
sex_Female,-0.144877,-0.088862,-0.086195,1.0,-1.0,0.002816,-0.002816,0.07106,-0.053957,-0.168106,0.194445,-0.205231,0.205231
sex_Male,0.144877,0.088862,0.086195,-1.0,1.0,-0.002816,0.002816,-0.07106,0.053957,0.168106,-0.194445,0.205231,-0.205231
smoker_No,-0.085721,-0.005929,0.133178,0.002816,-0.002816,1.0,-1.0,-0.244316,-0.155744,0.181624,0.128534,-0.054921,0.054921
smoker_Yes,0.085721,0.005929,-0.133178,-0.002816,0.002816,-1.0,1.0,0.244316,0.155744,-0.181624,-0.128534,0.054921,-0.054921
day_Fri,-0.086168,-0.055463,-0.142184,0.07106,-0.07106,-0.244316,0.244316,1.0,-0.216319,-0.195451,-0.169608,-0.058159,0.058159
day_Sat,0.054919,-0.00279,-0.041121,-0.053957,0.053957,-0.155744,0.155744,-0.216319,1.0,-0.500682,-0.43448,0.462709,-0.462709
day_Sun,0.122953,0.125114,0.193054,-0.168106,0.168106,0.181624,-0.181624,-0.195451,-0.500682,1.0,-0.392566,0.418071,-0.418071


# Hypothesis Testing

In [47]:
def hypothesisTesting(continuous_columns=None, category_columns=None, sampling_rate=0.20,
                      sampling_frac=0.05, data=None, alpha=0.05, random_state=None):
    """Run pairwise hypothesis tests and summarise the verdicts.

    Four analyses are performed:

    * Continuous column on its own: a central-limit check (mean of resampled
      means vs. population mean) followed by repeated one-sample t-tests of
      small random samples against the population mean.
    * Continuous vs. continuous: Pearson correlation test on the paired values.
    * Categorical vs. categorical: chi-square test of independence.
    * Categorical vs. continuous: one-way ANOVA across the category groups.

    Parameters
    ----------
    continuous_columns : list of str, optional
        Names of the continuous columns. ``None`` (default) means no columns.
    category_columns : list of str, optional
        Names of the categorical columns. ``None`` (default) means no columns.
    sampling_rate : float, default 0.20
        Fraction of the row count used both as the resample size for the
        central-limit check and as the number of one-sample t-test trials.
    sampling_frac : float, default 0.05
        Fraction of rows drawn for each one-sample t-test.
    data : pandas.DataFrame, optional
        Frame to analyse. Defaults to the notebook-level ``df``. The frame must
        still contain the raw (not one-hot encoded) categorical columns.
    alpha : float, default 0.05
        Significance level shared by every test.
    random_state : int or numpy.random.Generator, optional
        Seed for the random sampling, for reproducible results.

    Returns
    -------
    tuple
        ``(oneContinuousColumn_result, hypothesis_df)``: a dict keyed by
        continuous column holding the per-column statistics, and a square
        DataFrame (columns x columns) holding the verdict strings
        "There is a relationship" / "There is no relationship". Cells for
        pairs that were not tested stay NaN.

    Raises
    ------
    KeyError
        If any requested column is not present in the data.
    """
    related = "There is a relationship"
    unrelated = "There is no relationship"
    clt_draws = 40  # number of resamples averaged in the central-limit check

    frame = df if data is None else data
    continuous_columns = [] if continuous_columns is None else list(continuous_columns)
    category_columns = [] if category_columns is None else list(category_columns)

    # Fail early with a readable message instead of deep inside a test.
    missing = [name for name in continuous_columns + category_columns
               if name not in frame.columns]
    if missing:
        raise KeyError(
            f"Columns not found in the data (was it already one-hot encoded?): {missing}"
        )

    rng = np.random.default_rng(random_state)
    oneContinuousColumn_result = {}
    hypothesis_df = pd.DataFrame(columns=frame.columns, index=frame.columns)

    def _mark(first, second, verdict):
        # The verdict table is symmetric: fill both (row, col) and (col, row).
        hypothesis_df.loc[first, second] = verdict
        hypothesis_df.loc[second, first] = verdict

    # At least one draw / one trial, even for very small frames.
    samplesize = max(1, int(sampling_rate * len(frame)))

    for column in continuous_columns:
        # Central Limit Theorem: the mean of resampled means should sit close
        # to the population mean.
        population = frame[column].values
        population_mean = population.mean()
        sample_means = [rng.choice(population, samplesize).mean() for _ in range(clt_draws)]
        oneContinuousColumn_result[column] = {
            "Population Mean": population_mean,
            # A plain float (the previous set literal wrapped the value in a set).
            "Sample Means": float(np.mean(sample_means)),
        }

        # One-sample t-test, repeated on small random samples; H0 is that the
        # sample mean equals the population mean.
        H0_accepted = 0
        H0_rejected = 0
        for _ in range(samplesize):
            sample = frame[column].sample(frac=sampling_frac, random_state=rng)
            _, p_value = stats.ttest_1samp(sample, population_mean)
            if p_value > alpha:
                H0_accepted += 1
            else:
                H0_rejected += 1

        if H0_accepted > H0_rejected:
            conclusion = "H0 is accepted, Ha is rejected, There is no significant effect"
            # Diagonal cell: samples are consistent with the column's own mean.
            diagonal = related
        else:
            conclusion = "H0 is rejected, Ha is accepted, There is a significant effect"
            diagonal = unrelated
        oneContinuousColumn_result[column].update({
            "H0_accepted": H0_accepted,
            "H0_rejected": H0_rejected,
            "Conclusion": conclusion,
        })
        hypothesis_df.loc[column, column] = diagonal

    # Continuous vs. continuous: two-sided Pearson correlation test on the
    # paired rows. Comparing the *means* of two different variables (as a
    # two-sample t-test does) says nothing about whether they move together.
    for position, column_1 in enumerate(continuous_columns):
        for column_2 in continuous_columns[position + 1:]:
            paired = frame[[column_1, column_2]].dropna()
            _, p_value = stats.pearsonr(paired[column_1], paired[column_2])
            # A NaN p-value (e.g. constant input) compares False -> "no relationship".
            _mark(column_1, column_2, related if p_value <= alpha else unrelated)

    # Categorical vs. categorical: chi-square test of independence.
    for position, column_1 in enumerate(category_columns):
        for column_2 in category_columns[position + 1:]:
            observed = pd.crosstab(frame[column_1], frame[column_2])
            _, p_value, _, _ = stats.chi2_contingency(observed.values)
            _mark(column_1, column_2, related if p_value <= alpha else unrelated)

    # Categorical vs. continuous: one-way ANOVA. A small p-value rejects
    # "all group means are equal", i.e. the category is related to the value.
    for category_column in category_columns:
        for continuous_column in continuous_columns:
            groups = [values.values
                      for _, values in frame.groupby(category_column)[continuous_column]]
            if len(groups) < 2:
                # A single group cannot be compared with anything.
                _mark(category_column, continuous_column, unrelated)
                continue
            _, p_value = stats.f_oneway(*groups)
            _mark(category_column, continuous_column,
                  related if p_value <= alpha else unrelated)

    return oneContinuousColumn_result, hypothesis_df

In [None]:
# Run every test on the column groups declared earlier in the notebook.
ContinesColumn_result, comperativeColumn_result = hypothesisTesting(
    continuous_columns=continuous_columns,
    category_columns=category_columns,
)

In [None]:
# Tabulate the per-column summary: one column per continuous variable,
# one row per recorded statistic.
ContinesColumn_result_df = pd.DataFrame.from_dict(ContinesColumn_result)
ContinesColumn_result_df
     

In [None]:
# Encode the verdicts as 1 (relationship) / 0 (anything else, including
# untested pairs) so the heatmap receives plain integers; annotating a boolean
# frame is fragile because the cell values are not ordinary numbers.
relationship_flags = (comperativeColumn_result == 'There is a relationship').astype(int)

fig, ax = plt.subplots()
sns.heatmap(relationship_flags, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Hypothesis Test Results')
plt.show()