# **Tax Trip Statistical Data Analysis**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import scipy.stats as st
import warnings
warnings.filterwarnings('ignore')

**Loading the dataset**

In [None]:
df = pd.read_csv("/content/2023_Yellow_Taxi_Trip_dataset.xls")

In [None]:
df.head(10)

# **EDA(Exploratory Data Analysis)**

In [None]:
df.shape

In [None]:
df.dtypes

In [None]:
df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'], errors='coerce')
df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'], errors='coerce')

In [None]:
df.dtypes

In [None]:
df['duration'] = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
df['duration'].dt.total_seconds()/60

In [None]:
df = df[['passenger_count','payment_type','fare_amount','trip_distance','duration']]

In [None]:
df

In [None]:
df.isnull().sum()

In [None]:
df.dropna(inplace=True)
df

In [None]:
df[df.duplicated()]

In [None]:
df.drop_duplicates(inplace=True)

In [None]:
df['passenger_count'].value_counts(normalize=True)

In [None]:
df['payment_type'].value_counts(normalize=True)

In [None]:
df = df[df['payment_type']<3]
df  = df[(df['passenger_count']>0)&(df['passenger_count']<6)]

In [None]:
df.shape

In [None]:
df['payment_type'].replace([1,2],['Card','Cash'],inplace=True)

In [None]:
df.describe()

In [None]:
df = df[
    (df['fare_amount'] > 0) &
    (df['trip_distance'] > 0) &
    (df['duration'].dt.total_seconds() > 0)
]

In [None]:
for col in ['fare_amount','trip_distance','duration']:
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    IQR = q3-q1
    lower_bound = q1-1.5*IQR
    upper_bound = q3+1.5*IQR
    df = df[(df[col]>=lower_bound) & (df[col]<=upper_bound)]

In [None]:
df

In [None]:
plt.figure(figsize=(12,5))
plt.hist(df[df['payment_type'] == 'Card']['fare_amount'],
         histtype='barstacked', bins=20, edgecolor='k',
         color='#FA643F', label='Card')
plt.hist(df[df['payment_type'] == 'Cash']['fare_amount'],
         histtype='barstacked', bins=20, edgecolor='k',
         color='#FFBCAB', label='Cash')
plt.legend()
plt.show()

In [None]:
plt.figure(figsize=(10, 5))
plt.hist(df[df['payment_type'] == 'Card']['fare_amount'],
         histtype='barstacked', bins=20, edgecolor='k',
         color='#FA643F', label='Card')
plt.hist(df[df['payment_type'] == 'Cash']['fare_amount'],
         histtype='barstacked', bins=20, edgecolor='k',
         color='#FFBCAB', label='Cash')
plt.title('Fare Amount Distribution by Payment Type')
plt.xlabel('Fare Amount')
plt.ylabel('Frequency')
plt.legend()
plt.show()


In [None]:
df

In [None]:
df.groupby('payment_type').agg({'fare_amount':['mean','std'], 'trip_distance':['mean','std']})

In [None]:
plt.title('Preference of Payment Type')
plt.pie(df['payment_type'].value_counts(normalize=True),
        labels=df['payment_type'].value_counts().index,
        startangle=90,shadow=True,autopct='%1.1f%%',
        colors=['#FA643F', '#FFBCAB'])
plt.show()

In [None]:
df['passenger_count'].describe()

In [None]:
df

In [None]:
df.groupby(['payment_type','passenger_count'])[['passenger_count']].count()

In [None]:
passenger_count = df.groupby(['payment_type','passenger_count'])[['passenger_count']].count()
passenger_count = passenger_count.rename(columns={'passenger_count': 'count'})
passenger_count.reset_index(inplace=True)
passenger_count['perc'] = (passenger_count['count'] / passenger_count['count'].sum()) * 100
passenger_count

In [None]:
df = pd.DataFrame(columns=['payment_type', 1, 2, 3, 4, 5])
df['payment_type'] = ['Card', 'Cash']

# Assign values to the 'Card' row from rows 0 to 4 in 'perc' column
df.iloc[0, 1:] = passenger_count.loc[0:4, 'perc'].values

# Assign same value (from row 4) 5 times to the 'Cash' row
df.iloc[1, 1:] = [passenger_count.loc[4, 'perc']] * 5

df


In [None]:
ax = df.plot(
    x='payment_type',
    kind='barh',
    stacked=True,
    color=['#FA643F', '#FFBCAB', '#CBB2B2', '#F1F1F1', '#A3C9A8'])

for p in ax.patches:
    width = p.get_width()
    if width > 0:
        x, y = p.get_xy()
        ax.text(
            x + width / 3,
            y + p.get_height() / 3,
            '{:.0f}%'.format(width),
            ha='center',
            va='center'
        )


In [None]:
df

In [None]:
# Treat NaN as equal by using keep=False and na=False trick
df[df.duplicated(keep=False)]

In [None]:
df.duplicated(keep=False)

In [None]:
df['fare_amount'] = pd.to_numeric(df['fare_amount'], errors='coerce')
clean_fare = df['fare_amount'].dropna()

sm.qqplot(clean_fare, line='45')
plt.title('Q-Q Plot of Fare Amount')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('Sample Quantiles')
plt.grid(True) # Added grid for better visualization
plt.show()

In [None]:
card_sample = df[df['payment_type']=='Card']['fare_amount']
cash_sample = df[df['payment_type']=='Cash']['fare_amount']

In [None]:
# prompt: df['fare_amount'] = pd.to_numeric(df['fare_amount'], errors='coerce')
# clean_fare = df['fare_amount'].dropna()
# sm.qqplot(clean_fare, line='45')
# plt.title('Q-Q Plot of Fare Amount')
# plt.xlabel('Theoretical Quantiles')
# plt.ylabel('Sample Quantiles')
# plt.grid(True) # Added grid for better visualization
# plt.show()
# correct these code

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import scipy.stats as st
import warnings

warnings.filterwarnings('ignore')

df = pd.read_csv("/content/2023_Yellow_Taxi_Trip_dataset.xls")

# ... (rest of your existing code)

df['fare_amount'] = pd.to_numeric(df['fare_amount'], errors='coerce')
clean_fare = df['fare_amount'].dropna()

# Correct the QQ plot code to handle potential errors gracefully
try:
    sm.qqplot(clean_fare, line='45')
    plt.title('Q-Q Plot of Fare Amount')
    plt.xlabel('Theoretical Quantiles')
    plt.ylabel('Sample Quantiles')
    plt.grid(True)
    plt.show()
except Exception as e:
    print(f"An error occurred during QQ plot generation: {e}")
    # Add further error handling or debugging steps if needed
    # For instance, check the data type of 'clean_fare'

card_sample = df[df['payment_type']=='Card']['fare_amount'].dropna()
cash_sample = df[df['payment_type']=='Cash']['fare_amount'].dropna()
# ... (rest of your existing code)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import scipy.stats as st
import warnings

warnings.filterwarnings('ignore')

df = pd.read_csv("/content/2023_Yellow_Taxi_Trip_dataset.xls")

# ... (rest of your existing code)

df['fare_amount'] = pd.to_numeric(df['fare_amount'], errors='coerce')
clean_fare = df['fare_amount'].dropna()

# Correct the QQ plot code to handle potential errors gracefully
try:
    sm.qqplot(clean_fare, line='45')
    plt.title('Q-Q Plot of Fare Amount')
    plt.xlabel('Theoretical Quantiles')
    plt.ylabel('Sample Quantiles')
    plt.grid(True)
    plt.show()
except Exception as e:
    print(f"An error occurred during QQ plot generation: {e}")
    # Add further error handling or debugging steps if needed
    # For instance, check the data type of 'clean_fare'

card_sample = df[df['payment_type']=='Card']['fare_amount'].dropna()
cash_sample = df[df['payment_type']=='Cash']['fare_amount'].dropna()
# ... (rest of your existing code)

# Question:1 What libraries are used in the notebook for data analysis and visualization?

Answer: The notebook uses the following libraries:


*   pandas for data manipulation.
*   seaborn for enhanced visualizations.

*   statsmodels.api for statistical models (like the Q-Q plot).
*   scipy.stats for statistical tests (like the t-test).

*   warnings to manage warning messages.

# Question:2 What preprocessing steps are applied to the duration column?

Answer:
*  The tpep_pickup_datetime and tpep_dropoff_datetime columns are
   converted to datetime objects.
*  The duration is calculated by subtracting tpep_pickup_datetime  
   from tpep_dropoff_datetime.
*  The duration is converted to minutes.
*  Outliers are removed from the duration column.



# Question:3 How does the notebook handle missing values?

Answer: The notebook handles missing values by using the dropna() method to remove rows containing missing values.

# Question:4  What kind of plot is used to visualize the distribution of fare_amount for different payment_type values?

Answer: The notebook uses a histogram to visualize the distribution of fare_amount for 'Card' and 'Cash' payment types.

# Question:5 What statistical test is performed to compare the fare_amount between 'Card' and 'Cash' payment types? What does the result indicate?

Answer:
*  The notebook performs an independent samples t-test (using scipy.stats.ttest_ind) to compare the fare_amount for 'Card' and 'Cash' payment types.
*  The result will show a t-statistic and a p-value. The p-value indicates the statistical significance of the difference in fare amounts between the two payment types. If the p-value is below a chosen significance level (e.g., 0.05), it suggests a statistically significant difference.