<a href="https://colab.research.google.com/github/TaskoudisDimi/Computational-Intelligence-and-Statistical-Learning/blob/master/Trips.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Data**

In [None]:
import pandas as pd
import numpy as np
import zipfile
import os
import matplotlib.pyplot as plt



# Colab: mount Google Drive so the dataset archive is reachable from the runtime.
from google.colab import drive
drive.mount('/content/drive')

zip_file_path = "/content/drive/MyDrive/Programming/AI/Datasets/Trips/Trips.zip"
destination = "/content/drive/MyDrive/Programming/AI/Datasets/Trips/"

# Unpack the archive into `destination`. The handle is deliberately NOT named
# `zip`: a notebook-scope binding with that name would shadow the builtin
# `zip()` for every cell executed afterwards.
with zipfile.ZipFile(zip_file_path, 'r') as archive:
  archive.extractall(destination)


# Folder holding the extracted CSV files and the text file listing their column names.
data_dir = "/content/drive/MyDrive/Programming/AI/Datasets/Trips/Trips/"
columns_file = "/content/drive/MyDrive/Programming/AI/Datasets/Trips/columns.txt"

# Subset of columns retained from the raw trip files after they are renamed.
columns_to_keep = ['Trip_Num', 'Start_Date', 'Start_Lat', 'Start_Lon', 'Start_PostalCode',
                   'End_Date', 'End_Lat', 'End_Lon', 'End_PostalCode', 'Trip_Completed', 'age']



In [None]:



def ProccessingData(data_dir, columns_file, keep_columns=None):
    """Load every trip CSV in ``data_dir`` and aggregate completed trips into pickup counts.

    Parameters
    ----------
    data_dir : str
        Folder scanned (non-recursively) for files whose name ends in ``.csv``.
    columns_file : str
        Text file with one column name per line. Spaces, square brackets,
        commas and newlines are stripped from both ends of each line; lines
        that end up empty are ignored.
    keep_columns : list of str, optional
        Columns retained after renaming. Must contain ``Start_Date``,
        ``Start_PostalCode`` and ``Trip_Completed``. When omitted, the
        notebook-level ``columns_to_keep`` list is used.

    Returns
    -------
    pandas.DataFrame
        One row per distinct (``Start_Date``, ``Start_PostalCode``, ``Day``,
        ``Month``, ``Year``) combination with a ``pickups`` count. Rows whose
        postal code is missing or not numeric are excluded.

    Raises
    ------
    ValueError
        If ``data_dir`` contains no ``.csv`` file.
    """
    if keep_columns is None:
        keep_columns = columns_to_keep

    # Read and clean the column names.
    with open(columns_file, 'r') as columns:
        stripped_lines = (line.strip(' [ ] ,\n') for line in columns)
        column_names = [name for name in stripped_lines if name]
    print(column_names)

    # Read each CSV (sorted for a deterministic load order) and rename its columns.
    dataframes = []
    for file in sorted(os.listdir(data_dir)):
        if not file.endswith('.csv'):
            continue
        # NOTE(review): read_csv consumes the first line of each file as a
        # header before the names are replaced; if the raw files have no header
        # row, that first trip is lost and header=None is needed - TODO confirm.
        df = pd.read_csv(os.path.join(data_dir, file))
        # Assign through the public setter; writing into `df.columns.values`
        # mutates the Index buffer behind pandas' back and is not supported.
        df.columns = column_names
        dataframes.append(df)

    if not dataframes:
        raise ValueError(f"No .csv files found in {data_dir!r}")

    # Concatenate the DataFrames vertically to append rows.
    combined_df = pd.concat(dataframes, axis=0, ignore_index=True)

    # Explicit copy so later transformations never act on a view of combined_df.
    data = combined_df[list(keep_columns)].copy()
    print(data.head())

    # Keep only completed trips; a missing flag compares unequal to 1 and is dropped.
    completed = data.loc[data['Trip_Completed'] == 1].drop(columns=['Trip_Completed'])

    # Parse dates BEFORE any zero-filling so a missing date becomes NaT (and is
    # dropped) instead of risking a 0 being read as an epoch timestamp.
    # Postal codes are parsed numerically: numeric strings such as "11435" are
    # kept, anything unparseable or missing becomes the sentinel 0.
    completed = completed.assign(
        Start_Date=pd.to_datetime(completed['Start_Date'], errors='coerce'),
        Start_PostalCode=pd.to_numeric(completed['Start_PostalCode'], errors='coerce')
        .fillna(0)
        .astype(int),
    ).dropna(subset=['Start_Date'])

    # Feature engineering: calendar parts of the trip start plus a unit counter.
    start = completed['Start_Date']
    dataTrips = completed.assign(
        Day=start.dt.day,
        Month=start.dt.month,
        Year=start.dt.year,
        pickups=1,
    )

    # NOTE(review): the grouping key includes the full Start_Date timestamp, so
    # counts are per exact timestamp rather than per calendar day - confirm
    # whether daily aggregation was intended.
    dataTrips = (
        dataTrips
        .groupby(['Start_Date', 'Start_PostalCode', 'Day', 'Month', 'Year'])['pickups']
        .sum()
        .reset_index()
    )

    # Discard the sentinel postal code used for missing / invalid values.
    dataTrips = dataTrips[dataTrips['Start_PostalCode'] != 0]
    return dataTrips



In [None]:

# Build the aggregated pickups table from the CSV files under `data_dir`,
# using the column names listed in `columns_file`.
data = ProccessingData(data_dir, columns_file)


In [None]:

def save(data_to_save, file_path='/content/drive/MyDrive/Programming/AI/Datasets/Trips/output_data.csv'):
    """Write a DataFrame to CSV (without its index) and hand the frame back.

    Parameters
    ----------
    data_to_save : pandas.DataFrame
        Frame to persist.
    file_path : str, optional
        Destination CSV path. Defaults to the project's output file on the
        mounted Drive.

    Returns
    -------
    pandas.DataFrame
        ``data_to_save`` itself, unchanged, so a call such as
        ``data = save(data)`` keeps the name bound to the frame instead of
        rebinding it to ``None``.
    """
    data_to_save.to_csv(file_path, index=False)
    return data_to_save


In [None]:
# Persist the aggregated table. The call is made for its side effect only:
# `data` is intentionally not rebound, so it keeps referring to the DataFrame.
save(data)

In [None]:


# Quick structural overview of the aggregated pickups table.
print(data.shape)
print(list(data.columns))
print(data.sample())
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would append a stray "None" line to the output.
data.info()
print(data.describe())

## **Regression**

In [None]:

## Build AI Model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

# `data` is the aggregated frame produced by ProccessingData. `dataTrips` only
# exists as a local inside that function, so referencing it here raises a
# NameError on a fresh kernel.
X = data[['Start_PostalCode', 'Day', 'Month', 'Year']]
y = data['pickups']

# Hold out 30% of the rows for evaluation. An integer test_size is an absolute
# row count, so the previous value of 30 kept only thirty rows for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# LinearRegression stays imported as an optional baseline: it can be evaluated
# with the same metrics by swapping the estimator below.

# Create and train the Random Forest regressor.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict on the held-out rows.
y_pred = rf_model.predict(X_test)

# Evaluate the model. RMSE is derived from MSE directly: the `squared=False`
# keyword of mean_squared_error is deprecated and missing from recent
# scikit-learn releases.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R2) Score: {r2}")

Mean Absolute Error (MAE): 0.39228714225168754
Mean Squared Error (MSE): 0.6262850038165251
Root Mean Squared Error (RMSE): 0.7913817055103846
R-squared (R2) Score: 0.22680863726354927


In [None]:
# Largest pickup count in the full target, then in the held-out split.
for target_values in (y, y_test):
    print(max(target_values))

17
5


In [None]:
import matplotlib.pyplot as plt

# Actual vs. predicted pickups on the held-out rows, drawn through the
# explicit Axes interface.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, alpha=0.5)

ax.set_xlabel("Actual Values")
ax.set_ylabel("Predicted Values")
ax.set_title("Actual vs. Predicted Values")

# Identity line: points on it are perfect predictions.
lowest, highest = min(y_test), max(y_test)
ax.plot([lowest, highest], [lowest, highest], linestyle='--', color='red', linewidth=2)

plt.show()

In [None]:

# Query the random forest for one hypothetical day (edit the values as needed).
future_row = {
    'Start_PostalCode': 11692,
    'Day': 19,
    'Month': 11,
    'Year': 2025,
}
X_future = pd.DataFrame(future_row, index=[0])

future_pickups = rf_model.predict(X_future)
print(f"Predicted Pickups for Future Day: {future_pickups}")

Predicted Pickups for Future Day: [1.14948413]


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor

# Candidate settings: 2 or 3 neighbours, leaf sizes 15 through 24.
# NOTE(review): per the scikit-learn docs leaf_size mainly affects tree
# construction/query speed and memory, so searching it may only add fits -
# confirm it is worth tuning.
knn_param_grid = {
    "n_neighbors": np.arange(2, 4),
    "leaf_size": np.arange(15, 25),
}

regk_grid = GridSearchCV(
    estimator=KNeighborsRegressor(),
    param_grid=knn_param_grid,
    scoring="neg_mean_absolute_error",
    cv=2,
)

# Run the 2-fold search on the training split.
regk_grid.fit(X_train, y_train)

# No hold-out predictions are computed in this cell; only the winning
# parameter combination is displayed.
regk_grid.best_params_





{'leaf_size': 15, 'n_neighbors': 3}

In [None]:

# Ask the tuned KNN search object for one hypothetical day (edit the values as needed).
X_future = pd.DataFrame(
    {
        'Start_PostalCode': [11435],
        'Day': [1],
        'Month': [1],
        'Year': [2025],
    },
    index=[0],
)

future_pickups = regk_grid.predict(X_future)
print(f"Predicted Pickups for Future Day: {future_pickups}")

Predicted Pickups for Future Day: [1.]


In [None]:
import matplotlib.pyplot as plt

# Evaluate the tuned KNN model on the held-out rows. The predictions get their
# own name so the random-forest `y_pred` is left untouched; previously this
# cell re-plotted `y_pred` and duplicated the random-forest figure.
knn_pred = regk_grid.predict(X_test)

# Plot actual vs. predicted values
plt.figure(figsize=(8, 6))
plt.scatter(y_test, knn_pred, alpha=0.5)  # Scatter plot

plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
# Title names the model so this figure is distinguishable from the RF one.
plt.title("Actual vs. Predicted Values (KNN)")

# Add a diagonal line for reference (perfect prediction)
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], linestyle='--', color='red', linewidth=2)

plt.show()

In [None]:
import matplotlib.pyplot as plt

# Bar chart: one bar per held-out row, positioned at the actual value with the
# predicted value as its height (bars sharing an x position overlap).
fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(y_test, y_pred, alpha=0.5)

ax.set_xlabel("Actual Values")
ax.set_ylabel("Predicted Values")
ax.set_title("Actual vs. Predicted Values")

# Identity line for reference (perfect prediction).
lowest, highest = min(y_test), max(y_test)
ax.plot([lowest, highest], [lowest, highest], linestyle='--', color='red', linewidth=2)

plt.show()

In [None]:
# Largest predicted value, then the largest actual value in the test split,
# each on its own line.
print(max(y_pred), max(y_test), sep="\n")

2.748392524142525
2
