In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency, ttest_ind, pearsonr

In [8]:
cab_data = pd.read_csv("Cab_Data.csv")
customer_id = pd.read_csv("Customer_ID.csv")
transaction_id = pd.read_csv("Transaction_ID.csv")
city = pd.read_csv("City.csv")

In [10]:
cab_data.head()
cab_data.dtypes

customer_id.head()
customer_id.dtypes

transaction_id.head()
transaction_id.dtypes

city.head()
city.dtypes

City          object
Population    object
Users         object
dtype: object

In [11]:
# Merge transaction_id and customer_id dataframes on customer ID column
transaction_customer = pd.merge(transaction_id, customer_id, on='Customer ID')

# Merge cab_data and city dataframes on City column
cab_city = pd.merge(cab_data, city, on='City')

In [14]:
# Create a new column in cab_city dataframe for profit
cab_city['Profit'] = cab_city['Price Charged'] - cab_city['Cost of Trip']

In [15]:
# Merge transaction_customer and cab_city dataframes on Transaction ID column
master_data = pd.merge(transaction_customer, cab_city, on='Transaction ID')
# The master_data dataframe contains all the relevant information from the original datasets, including details of transactions, customer demographics, and cab company details.

In [21]:
# Check for duplicates in master_data dataframe
master_data.duplicated().sum()

0

In [22]:
# Drop any duplicate rows from master_data dataframe
master_data.drop_duplicates(inplace=True)

In [19]:
# Check for missing values in the master_data dataframe
master_data.isna().sum()

Transaction ID        0
Customer ID           0
Payment_Mode          0
Gender                0
Age                   0
Income (USD/Month)    0
Date of Travel        0
Company               0
City                  0
KM Travelled          0
Price Charged         0
Cost of Trip          0
Population            0
Users                 0
Profit                0
dtype: int64

In [20]:
# Detect and remove outliers in the Profit column using Z-score
from scipy import stats
z_scores = stats.zscore(master_data['Profit'])
abs_z_scores = np.abs(z_scores)
filtered_entries = (abs_z_scores < 3)
master_data = master_data[filtered_entries]

# EDA #

In [26]:
# Merge the datasets
merged_data = pd.merge(cab_data, transaction_id, on='Transaction ID')
merged_data = pd.merge(merged_data, customer_id, on='Customer ID')
merged_data = pd.merge(merged_data, city, on='City')

In [29]:
merged_data.head()

Unnamed: 0,Transaction ID,Date of Travel,Company,City,KM Travelled,Price Charged,Cost of Trip,Customer ID,Payment_Mode,Gender,Age,Income (USD/Month),Population,Users
0,10000011,42377,Pink Cab,ATLANTA GA,30.45,370.95,313.635,29290,Card,Male,28,10813,814885,24701
1,10351127,43302,Yellow Cab,ATLANTA GA,26.19,598.7,317.4228,29290,Cash,Male,28,10813,814885,24701
2,10412921,43427,Yellow Cab,ATLANTA GA,42.55,792.05,597.402,29290,Card,Male,28,10813,814885,24701
3,10000012,42375,Pink Cab,ATLANTA GA,28.62,358.52,334.854,27703,Card,Male,27,9237,814885,24701
4,10320494,43211,Yellow Cab,ATLANTA GA,36.38,721.1,467.1192,27703,Card,Male,27,9237,814885,24701


In [27]:
# Explore the data
print(merged_data.head())
print(merged_data.describe())
print(merged_data.info())

   Transaction ID  Date of Travel     Company        City  KM Travelled  \
0        10000011           42377    Pink Cab  ATLANTA GA         30.45   
1        10351127           43302  Yellow Cab  ATLANTA GA         26.19   
2        10412921           43427  Yellow Cab  ATLANTA GA         42.55   
3        10000012           42375    Pink Cab  ATLANTA GA         28.62   
4        10320494           43211  Yellow Cab  ATLANTA GA         36.38   

   Price Charged  Cost of Trip  Customer ID Payment_Mode Gender  Age  \
0         370.95      313.6350        29290         Card   Male   28   
1         598.70      317.4228        29290         Cash   Male   28   
2         792.05      597.4020        29290         Card   Male   28   
3         358.52      334.8540        27703         Card   Male   27   
4         721.10      467.1192        27703         Card   Male   27   

   Income (USD/Month) Population     Users  
0               10813   814,885    24,701   
1               10813   81

## Create a list of hypotheses to investigate

In [31]:
# Hypothesis 1: Male customers prefer Yellow Cab over Pink Cab
# Create a contingency table of cab company and gender
cont_table = pd.crosstab(merged_data['Company'], merged_data['Gender'])

In [34]:
# Conduct a chi-squared test to determine if there is a significant difference
# in the proportion of male customers between the two cab companies
chi2_stat, p_value, dof, expected = chi2_contingency(cont_table)
alpha = 0.05
if p_value < alpha:
    print("Hypothesis 1: Reject null hypothesis. There is a significant difference in the proportion of male customers between the two cab companies.")
else:
    print("Hypothesis 1: Fail to reject null hypothesis. There is no significant difference in the proportion of male customers between the two cab companies.")

Hypothesis 1: Reject null hypothesis. There is a significant difference in the proportion of male customers between the two cab companies.


In [36]:
# Hypothesis 2: The cost of the trip is correlated with the distance traveled
# Calculate the correlation coefficient and p-value between distance traveled and price charged
corr_coef, p_value = pearsonr(merged_data['KM Travelled'], merged_data['Price Charged'])
if p_value < alpha:
    print("Hypothesis 2: Reject null hypothesis. There is a significant correlation between distance traveled and price charged.")
else:
    print("Hypothesis 2: Fail to reject null hypothesis. There is no significant correlation between distance traveled and price charged.")

Hypothesis 2: Reject null hypothesis. There is a significant correlation between distance traveled and price charged.


In [37]:
# Hypothesis 3: Customers paying with cash have a higher income than customers paying with a card
# Calculate the mean income of customers paying with cash and those paying with a card
cash_mean = merged_data.loc[merged_data['Payment_Mode'] == 'Cash', 'Income (USD/Month)'].mean()
card_mean = merged_data.loc[merged_data['Payment_Mode'] == 'Card', 'Income (USD/Month)'].mean()

# Conduct a t-test to determine if there is a significant difference in the mean income between the two payment modes
t_stat, p_value = ttest_ind(merged_data.loc[merged_data['Payment_Mode'] == 'Cash', 'Income (USD/Month)'],
                            merged_data.loc[merged_data['Payment_Mode'] == 'Card', 'Income (USD/Month)'])
if p_value < alpha:
    print("Hypothesis 3: Reject null hypothesis. Customers paying with cash have a higher income than customers paying with a card.")
else:
    print("Hypothesis 3: Fail to reject null hypothesis. There is no significant difference in the mean income between the two payment modes.")

Hypothesis 3: Fail to reject null hypothesis. There is no significant difference in the mean income between the two payment modes.
