In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import seaborn as sns 
import matplotlib.pyplot as plt
from datasist.structdata import detect_outliers 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import pickle

In [None]:
# Global seaborn styling: 6x6 default figure size and fonts scaled by 1.5.
figure_defaults = {"figure.figsize": [6, 6]}
sns.set(rc=figure_defaults, font_scale=1.5)

In [None]:
# NOTE(review): absolute, machine-specific path — the notebook only runs where this exact file exists.
csv_path = r"C:\Users\omara\OneDrive\Desktop\career\internship\cellula\week 4\final_internship_data.csv"
df = pd.read_csv(csv_path)

In [None]:
# First look at the raw frame.
df.head(n=5)

In [None]:
df.columns

In [None]:
df.info()

In [None]:
# Missing values per column.
df.isnull().sum()

In [None]:
df.describe()

In [None]:
# Number of fully duplicated rows (reported only, nothing is removed here).
df.duplicated().sum()

In [None]:
df.sample(n=3)

In [None]:
# Remove every row that has at least one missing value.
df.dropna(axis="index", inplace=True)

In [None]:
# Re-check: every count should now be zero.
df.isnull().sum()

In [None]:
df.columns

In [None]:
# These four columns are removed before any analysis is done.
dropped_columns = ["key", "Driver Name", "User Name", "User ID"]
df.drop(columns=dropped_columns, inplace=True)


In [None]:
df.head(n=5)

In [None]:
def haversine(lon_1, lon_2, lat_1, lat_2, radius_km=6371):
    """Great-circle (haversine) distance between two points given in degrees.

    Note the argument order: both longitudes come first, then both latitudes.

    Parameters
    ----------
    lon_1, lon_2 : scalar or array-like
        Longitude of the first and second point, in decimal degrees.
    lat_1, lat_2 : scalar or array-like
        Latitude of the first and second point, in decimal degrees.
    radius_km : float, default 6371
        Radius of the sphere, in kilometres; the result is in the same unit.

    Returns
    -------
    scalar or array-like
        Distance along the sphere's surface, element-wise for array inputs.
    """
    lon_1, lon_2, lat_1, lat_2 = map(np.radians, [lon_1, lon_2, lat_1, lat_2])
    diff_lon = lon_2 - lon_1
    diff_lat = lat_2 - lat_1

    # Haversine term; mathematically within [0, 1] for valid coordinates.
    a = (np.sin(diff_lat / 2.0) ** 2
         + np.cos(lat_1) * np.cos(lat_2) * np.sin(diff_lon / 2.0) ** 2)
    # Floating-point rounding can leave `a` slightly above 1 for (near-)antipodal
    # points, which would make arcsin return NaN; cap it at exactly 1.
    a = np.minimum(a, 1.0)

    km = 2 * radius_km * np.arcsin(np.sqrt(a))
    return km

In [None]:
# Haversine trip length in km, computed from the pickup/dropoff coordinates.
df["my distance"] = haversine(
    df["pickup_longitude"],
    df["dropoff_longitude"],
    df["pickup_latitude"],
    df["dropoff_latitude"],
)

In [None]:
df.head(n=5)

In [None]:
df.columns

In [None]:
# The raw coordinates are removed once the distance column has been derived from them.
coordinate_columns = ["dropoff_latitude", "dropoff_longitude", "pickup_latitude", "pickup_longitude"]
df.drop(columns=coordinate_columns, inplace=True)

In [None]:
df.head(n=5)

In [None]:
# NOTE(review): errors='coerce' turns any value that does not match the format into NaT
# instead of raising — confirm the raw timestamps carry no extra suffix.
timestamp_format = "%Y-%m-%d %H:%M:%S"
df["pickup_datetime"] = pd.to_datetime(
    df["pickup_datetime"], format=timestamp_format, errors='coerce'
)

In [None]:
df.info()

In [None]:
# Switch columns to the best-fitting pandas dtypes.
df = df.convert_dtypes()

In [None]:
df.info()

In [None]:
# Any NaT produced by the datetime parsing above shows up in this count.
df.isnull().sum()

In [None]:
# Store the minute component of the pickup time in a new "min" column.
pickup_time = df["pickup_datetime"]
df["min"] = pickup_time.dt.minute

In [None]:
df.head(n=5)

In [None]:
df.drop(columns=["pickup_datetime"], inplace=True)

In [None]:
df.head(n=5)

In [None]:
# Share of rides per car condition.
px.pie(df, names="Car Condition", title="Car condition distribution").update_traces(textinfo="percent")

In [None]:
# Mean fare per car condition (histfunc="avg" averages y within each x category).
px.histogram(df, x="Car Condition", y="fare_amount", histfunc="avg", title="Average Fare Amount by Car Condition")

In [None]:
# Mean of the dataset-provided "distance" column per car condition.
px.histogram(df, x="Car Condition", y="distance", histfunc="avg", title="Average distance by Car Condition")

In [None]:
# Same chart for the haversine-derived "my distance" column; the title now names the
# column so this figure is distinguishable from the one above.
px.histogram(df, x="Car Condition", y="my distance", histfunc="avg", title="Average my distance (haversine) by Car Condition")

In [None]:
df["Car Condition"].unique()

In [None]:
# One fare boxplot per car condition, side by side on a shared y axis.
conditions = ['Very Good', 'Excellent', 'Bad', 'Good']
fig, axes = plt.subplots(1, len(conditions), figsize=(18, 5), sharey=True)

for position, (panel, condition) in enumerate(zip(axes, conditions)):
    rides_for_condition = df[df['Car Condition'] == condition]
    sns.boxplot(data=rides_for_condition, y='fare_amount', ax=panel, color='skyblue')
    panel.set_title(f"Car Condition: {condition}")
    panel.set_xlabel("")
    # Only the left-most panel carries the y label.
    if position == 0:
        panel.set_ylabel("Fare Amount ($)")
    else:
        panel.set_ylabel("")

fig.tight_layout()
# Title is placed above the already laid-out panels (y > 1).
fig.suptitle("Fare Amount Distribution by Car Condition", fontsize=16, y=1.05)
plt.show()


In [None]:
# Mean passenger count per car condition.
px.histogram(df, x="Car Condition", y="passenger_count", histfunc="avg", title="Average passenger count by Car Condition")

In [None]:
df["Weather"].unique()

In [None]:
# Share of rides per weather category.
px.pie(df, names="Weather", title="Weather condition distribution").update_traces(textinfo="percent")

In [None]:
# Ride counts per car condition, split by weather.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x="Car Condition", hue="Weather", ax=ax)
ax.set_title("Weather Distribution by Car Condition")
ax.set_ylabel("Number of Rides")
plt.xticks(rotation=30)
plt.show()

In [None]:
# Fare spread per car condition, split by weather; legend is moved outside the axes.
fig, ax = plt.subplots(figsize=(14, 6))
sns.boxplot(data=df, x="Car Condition", y="fare_amount", hue="Weather", ax=ax)
ax.set_title("Fare Amount Distribution by Car Condition and Weather")
ax.set_ylabel("Fare Amount")
plt.xticks(rotation=15)
ax.legend(title="Weather", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.show()

In [None]:
df["Traffic Condition"].unique()

In [None]:
# Share of rides per traffic condition.
px.pie(df, names="Traffic Condition", title="Traffic condition distribution").update_traces(textinfo="percent")

In [None]:
# Ride counts per car condition, split by traffic condition.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x="Car Condition", hue="Traffic Condition", ax=ax)
ax.set_title("Traffic Condition Distribution by Car Condition")
ax.set_ylabel("Number of Rides")
plt.xticks(rotation=30)
plt.show()

In [None]:
# Fare spread per car condition, split by traffic condition; legend is moved outside the axes.
fig, ax = plt.subplots(figsize=(14, 6))
sns.boxplot(data=df, x="Car Condition", y="fare_amount", hue="Traffic Condition", ax=ax)
ax.set_title("Fare Amount Distribution by Car Condition and Traffic Condition")
ax.set_ylabel("Fare Amount")
plt.xticks(rotation=15)
ax.legend(title="Traffic Condition", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.show()

In [None]:
df.head(n=5)

In [None]:
# Fare spread per weather category, coloured by car condition.
px.box(
    df,
    x="Weather",
    y="fare_amount",
    color="Car Condition",
    title="Fare Amount Distribution by Weather and Car Condition",
)


In [None]:
# Fare spread per traffic condition, coloured by car condition.
px.box(
    df,
    x="Traffic Condition",
    y="fare_amount",
    color="Car Condition",
    title="Fare Amount Distribution by Traffic Condition and Car Condition",
)


In [None]:
df.head(n=5)

In [None]:
# Fare against passenger count.
ax = sns.scatterplot(data=df, x="passenger_count", y="fare_amount")
ax.set_title("Passenger Count By Fare Amount")

In [None]:
# Fare against the dataset-provided distance.
ax = sns.scatterplot(data=df, x="distance", y="fare_amount")
ax.set_title("distance By Fare Amount")

In [None]:
# Fare against the haversine-derived distance.
ax = sns.scatterplot(data=df, x="my distance", y="fare_amount")
ax.set_title("my distance By Fare Amount")

In [None]:
# How many rows have a zero or negative fare.
df.loc[df["fare_amount"].le(0)].count()

In [None]:
# Remove those rows by index label.
nonpositive_fare_rows = df.loc[df["fare_amount"].le(0)].index
df.drop(index=nonpositive_fare_rows, inplace=True)

In [None]:
# Rows whose dataset-provided distance is zero or negative.
df.loc[df["distance"].le(0)]

In [None]:
df.info()

In [None]:
nonpositive_distance_rows = df.loc[df["distance"].le(0)].index
df.drop(index=nonpositive_distance_rows, inplace=True)

In [None]:
df.info()

In [None]:
# Fare against distance after the removals above (interactive version).
px.scatter(df, x="distance", y="fare_amount", title="Fare Amount By Distance ")

In [None]:
ax = sns.scatterplot(data=df, x="distance", y="fare_amount")
ax.set_title("Fare Amount By Distance ")

In [None]:
# Same plot for the haversine-derived distance.
ax = sns.scatterplot(data=df, x="my distance", y="fare_amount")
ax.set_title("Fare Amount By Distance ")

In [None]:
# Count of rows where the haversine-derived distance is still zero or negative.
df.loc[df["my distance"].le(0)].count()

In [None]:
# Pairwise correlations between the numeric columns, annotated to one decimal.
correlations = df.corr(numeric_only=True)
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlations, annot=True, fmt=".1f", cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()


In [None]:
# Box plots used to eyeball outliers in each distance column and in the fare.
px.box(df, x="distance")

In [None]:
px.box(df, x="my distance")

In [None]:
sns.boxplot(data=df, x="fare_amount")

In [None]:
# Inspect rows with a fare above 60 but a distance below 1.
df[(df["fare_amount"]>60) & (df["distance"]<1) ]

In [None]:
df.describe()

In [None]:
# Rows carrying the current minimum fare.
df[df["fare_amount"]==df["fare_amount"].min()]

In [None]:
sns.histplot(df,x=df["fare_amount"])

In [None]:
df[df["fare_amount"]==df["fare_amount"].min()]

In [None]:
# Drop every row whose fare equals the current minimum.
# The minimum is recomputed on each execution, so this cell is not idempotent:
# running it again removes the next-lowest fare as well.
df.drop(df[df["fare_amount"]==df["fare_amount"].min()].index, axis=0,inplace=True)

In [None]:
# Rows carrying the new minimum fare after the drop above.
df[df["fare_amount"]==df["fare_amount"].min()]

In [None]:
# Rows carrying the maximum distance (inspection only).
df[df["distance"]==df["distance"].max()]

In [None]:
# NOTE(review): identical to the min-fare drop two cells earlier, so a second tier of
# lowest fares is removed here. The preceding cell inspects max distance, not fare —
# confirm this second removal is intended and not a copy-paste leftover.
df.drop(df[df["fare_amount"]==df["fare_amount"].min()].index, axis=0,inplace=True)

In [None]:
df[df["fare_amount"]==df["fare_amount"].min()]

In [None]:
# Mean fare for each value of the "year" column.
fare_by_year = df.groupby("year")["fare_amount"]
fare_by_year.mean()


In [None]:
# Bar chart of fare per year.
ax = sns.barplot(data=df, x="year", y="fare_amount")
ax.set_title("Year by Average Fare Amount")

In [None]:
df.columns

In [None]:
# Remove the haversine-derived column.
df.drop(columns=["my distance"], inplace=True)

In [None]:
df.sample(n=3)

In [None]:
# Rows with a distance above 50.
df.loc[df["distance"].gt(50)]

In [None]:
# Fare statistics for those rows.
df.loc[df["distance"].gt(50), "fare_amount"].describe()

In [None]:
# Remove rows with a distance above 50.
long_trip_rows = df.loc[df["distance"].gt(50)].index
df.drop(index=long_trip_rows, inplace=True)

In [None]:
df.describe()

In [None]:
# Rows with a distance below 0.1.
df.loc[df["distance"].lt(0.1)]

In [None]:
df.loc[df["distance"].lt(0.1), "fare_amount"].describe()

In [None]:
# Remove rows with a distance below 0.1.
tiny_trip_rows = df.loc[df["distance"].lt(0.1)].index
df.drop(index=tiny_trip_rows, inplace=True)

In [None]:
df.loc[:, ["distance"]].describe()

In [None]:
df.info()

In [None]:
# Distance statistics for fares above 100.
df.loc[df["fare_amount"].gt(100), "distance"].describe()

In [None]:
# Fare above 100 combined with a distance below 10.
pricey_short = df["fare_amount"].gt(100) & df["distance"].lt(10)
df.loc[pricey_short]

In [None]:
# Remove the rows matching that combination.
pricey_short = df["fare_amount"].gt(100) & df["distance"].lt(10)
df.drop(index=df.loc[pricey_short].index, inplace=True)

In [None]:
df.loc[df["fare_amount"].gt(100), "distance"].describe()

In [None]:
# Fare below 10 combined with a distance above 20.
cheap_long = df["fare_amount"].lt(10) & df["distance"].gt(20)
df.loc[cheap_long]

In [None]:
cheap_long = df["fare_amount"].lt(10) & df["distance"].gt(20)
df.loc[cheap_long].describe()

In [None]:
# Distance statistics for fares below 10, before the removal.
df.loc[df["fare_amount"].lt(10), "distance"].describe()

In [None]:
# Remove the rows matching that combination.
cheap_long = df["fare_amount"].lt(10) & df["distance"].gt(20)
df.drop(index=df.loc[cheap_long].index, inplace=True)

In [None]:
df.loc[df["fare_amount"].lt(10), "distance"].describe()

In [None]:
# Fare against distance after all the removals above.
ax = sns.scatterplot(data=df, x="distance", y="fare_amount")
ax.set_title("Fare Amount By Distance ")