In [1]:
import pandas as pd
import mysql.connector

def connect_to_mysql_and_query(user, password, host, database, query):
    try:
        # Establish connection to MySQL server
        conn = mysql.connector.connect(
            host=host,          # Your host
            user=user,          # Your username
            password=password,  # Your password
            database=database   # Your database name
        )

        # Create a cursor object to execute SQL queries
        cursor = conn.cursor()

        # Execute the query
        cursor.execute(query)

        # Fetch all rows from the executed query
        rows = cursor.fetchall()

        # Get column names from the cursor
        columns = [i[0] for i in cursor.description]

        # Create a DataFrame from the fetched rows and columns
        df = pd.DataFrame(rows, columns=columns)

        return df

    except mysql.connector.Error as err:
        print(f"Error: {err}")
        return None

    finally:
        # Close the cursor and connection
        if cursor:
            cursor.close()
        if conn:
            conn.close()

def export_to_csv(df, file_path):
    df.to_csv(file_path, index=False)

def load_csv_to_dataframe(file_path):
    return pd.read_csv(file_path)

if __name__ == "__main__":
    # MySQL database credentials and query
    host = "localhost"       # Your host
    user = "root"            # Your username
    password = "Simran45%"   # Your password
    database = "TransportModel"
    query = "SELECT * FROM modeOfTransport"

    # Connect to MySQL and fetch data
    df = connect_to_mysql_and_query(user, password, host, database, query)

    if df is not None:
        # Export the DataFrame to CSV
        csv_file_path = 'output.csv'
        export_to_csv(df, csv_file_path)

        # Load the CSV file into a new DataFrame
        new_df = load_csv_to_dataframe(csv_file_path)

        # Print the loaded DataFrame
        print(new_df)
    else:
        print("Failed to fetch data from the database.")


  from pandas.core import (


    transportType  distance  cost environment  timeTaken  fuelCost  \
0             Car      10.5  15.0    Moderate       0.50       5.0   
1             Bus      15.0   2.5         Low       0.75       0.0   
2           Train      50.0   8.0         Low       1.00       0.0   
3         Bicycle       5.0   0.0    Very Low       0.25       0.0   
4         Walking       2.0   0.0         NaN       0.40       0.0   
..            ...       ...   ...         ...        ...       ...   
117       Tuk-Tuk      10.0   3.0    Moderate       0.50       1.0   
118    Velomobile      10.0   1.5         Low       0.60       0.0   
119    Wheelchair       5.0   0.0         NaN       0.40       0.0   
120           Yak      10.0   0.0         NaN       2.00       0.0   
121       Zipline       1.0  10.0         Low       0.10       0.0   

     personalVehicle economyBackground  averageFamilyAge accessibility  \
0                  1      Middle Class              35.0          High   
1          

In [2]:

# the script gets data from a database, saves it to a file, reads the file, and then shows the data.

In [2]:
#Read CSV file
import pandas as pd
csv_file_path = 'output.csv'
df1=pd.read_csv(csv_file_path)

In [3]:
# Count missing values in each column
missing_counts = df1.isnull().sum()

print(missing_counts)

transportType         0
distance              0
cost                  0
environment          33
timeTaken             0
fuelCost              0
personalVehicle       0
economyBackground     0
averageFamilyAge      0
accessibility         0
resistance            0
flexibility           0
urgency               0
purpose               0
timeFrame             0
dtype: int64


In [4]:
# feature_ adding :

In [5]:
from geopy.distance import geodesic
def calculate_geodesic_distance(df):
    """
    Calculates the geodesic distance between two sets of latitude and longitude
    for each row in the DataFrame.
    
    Args:
        df (pd.DataFrame): DataFrame containing latitude and longitude columns.
    
    Returns:
        pd.DataFrame: DataFrame with an additional column 'geodesic_distance' containing the distances.
    """
    distances = []
    
    for index, row in df.iterrows():
        point1 = (row['user_current_latitude'], row['user_current_longitude'])
        point2 = (row['transit_position_latitude'], row['transit_position_longitude'])
        
        distance = geodesic(point1, point2).kilometers
        distances.append(distance)
    
    df['geodesic_distance'] = distances
    return df

In [6]:
#This function, calculate_geodesic_distance, computes the geodesic (shortest path) distance between two geographic points for each row in a DataFrame.
#It takes a DataFrame containing latitude and longitude columns, 
#calculates the distance in kilometers for each pair of points, and adds a new column, geodesic_distance, with these distances.

In [7]:
#pip install pandas geopy

In [8]:
# pre_processing of Data:

In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

class LabelEncoderWrapper:
    def __init__(self):
        self.le = LabelEncoder()
    
    def fit(self, X, y=None):
        self.le.fit(X.squeeze())
        return self
    
    def transform(self, X):
        return self.le.transform(X.squeeze()).reshape(-1, 1)
    
    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)
    
    def inverse_transform(self, X):
        return self.le.inverse_transform(X.squeeze()).reshape(-1, 1)

def preprocess_data(df, one_hot_encode_columns, label_encode_columns, label_encode_nlp_columns, simple_label_encode_columns):
    """
    Preprocesses the data for classification of transportType.
    
    Args:
        df (pd.DataFrame): DataFrame containing the data.
        one_hot_encode_columns (list): List of columns to be one-hot encoded.
        label_encode_columns (list): List of columns to be label encoded.
        label_encode_nlp_columns (list): List of columns to be label encoded with NLP.
        simple_label_encode_columns (list): List of columns to be simply label encoded.
    
    Returns:
        pd.DataFrame, pd.Series: Preprocessed features and target.
    """
    # Define numerical columns
    numerical_features = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
    numerical_features = [col for col in numerical_features if col not in ['transportType']]
    
    # Fill missing values and scale numerical features
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    
    # Define a list to store all transformers
    transformers = [('num', numerical_transformer, numerical_features)]
    
    # Handle one-hot encoding for specified columns
    if one_hot_encode_columns:
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])
        transformers.append(('cat', categorical_transformer, one_hot_encode_columns))

    # Handle label encoding for specified columns
    for col in label_encode_columns:
        transformers.append(
            (f'label_{col}', Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='most_frequent')),
                ('label', LabelEncoderWrapper())
            ]), [col])
        )
    
    # Handle label encoding for NLP columns
    for col in label_encode_nlp_columns:
        df[col] = df[col].apply(lambda x: ' '.join(word_tokenize(x.lower())))
        transformers.append(
            (f'label_nlp_{col}', Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='most_frequent')),
                ('label', LabelEncoderWrapper())
            ]), [col])
        )
    
    # Handle simple label encoding for specified columns
    for col in simple_label_encode_columns:
        transformers.append(
            (f'simple_label_{col}', Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='most_frequent')),
                ('label', LabelEncoderWrapper())
            ]), [col])
        )
    
    # Handle the environment column separately
    transformers.append(
        ('env_imputer', SimpleImputer(strategy='most_frequent'), ['environment'])
    )
    
    # Create the preprocessor
    preprocessor = ColumnTransformer(transformers=transformers)
    
    # Encode target variable
    le_transport = LabelEncoder()
    df['transportType'] = le_transport.fit_transform(df['transportType'])
    df['environment'] = le_transport.fit_transform(df['environment'])
    
    # Separate features and target variable
    X = df.drop('transportType', axis=1)
    y = df['transportType']
    
    # Apply transformations
    X_preprocessed = preprocessor.fit_transform(X)
    
    # Get feature names after one-hot encoding if applicable
    feature_names = []
    for name, transformer, columns in preprocessor.transformers:
        if name == 'cat':
            onehot_feature_names = transformer.named_steps['onehot'].get_feature_names_out(columns)
            feature_names.extend(onehot_feature_names)
        else:
            feature_names.extend(columns)
    
    # Convert to DataFrame
    X_preprocessed_df = pd.DataFrame(X_preprocessed, columns=feature_names)
    
    return X_preprocessed_df, y

# Example usage
if __name__ == "__main__":
    

    one_hot_encode_columns = []
    label_encode_columns = ['accessibility', 'resistance', 'flexibility']
    label_encode_nlp_columns = ['urgency',  'economyBackground', 'purpose']
    simple_label_encode_columns = ['timeFrame']

    X, y = preprocess_data(df1, one_hot_encode_columns, label_encode_columns, label_encode_nlp_columns, simple_label_encode_columns)
    print(X.head())
    print(y.head())


   distance      cost  timeTaken  fuelCost  personalVehicle  averageFamilyAge  \
0 -0.130049 -0.129760  -0.224663 -0.130786         0.711458         -0.004060   
1 -0.129970 -0.129762  -0.205003 -0.130826        -1.405564         -0.499397   
2 -0.129361 -0.129761  -0.185344 -0.130826        -1.405564          0.491277   
3 -0.130144 -0.129762  -0.244322 -0.130826         0.711458         -0.994733   
4 -0.130196 -0.129762  -0.232526 -0.130826         0.711458         -1.490070   

   accessibility  resistance  flexibility  urgency  economyBackground  \
0            0.0         1.0          0.0      2.0                4.0   
1            0.0         2.0          2.0      1.0                4.0   
2            0.0         0.0          0.0      2.0                4.0   
3            0.0         1.0          0.0      0.0                3.0   
4            0.0         1.0          3.0      0.0                3.0   

   purpose  timeFrame  environment  
0      1.0        0.0          2.0  


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\WINNER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [20]:
df1

Unnamed: 0,transportType,distance,cost,environment,timeTaken,fuelCost,personalVehicle,economyBackground,averageFamilyAge,accessibility,resistance,flexibility,urgency,purpose,timeFrame
0,12,10.5,15.0,2,0.50,5.0,1,middle class,35.0,High,Low,High,medium,commute,Daily
1,8,15.0,2.5,1,0.75,0.0,0,middle class,30.0,High,Moderate,Medium,low,commute,Daily
2,101,50.0,8.0,1,1.00,0.0,0,middle class,40.0,High,High,High,medium,commute,Daily
3,5,5.0,0.0,4,0.25,0.0,1,low class,25.0,High,Low,High,high,exercise,Daily
4,109,2.0,0.0,5,0.40,0.0,1,low class,20.0,High,Low,Very High,high,leisure,Daily
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,105,10.0,3.0,2,0.50,1.0,0,middle class,35.0,High,Low,High,medium,commute,Daily
118,108,10.0,1.5,1,0.60,0.0,1,middle class,30.0,High,Low,High,medium,commute,Daily
119,111,5.0,0.0,5,0.40,0.0,1,low class,35.0,High,Low,High,high,medical,Daily
120,113,10.0,0.0,5,2.00,0.0,1,rural,40.0,Low,Low,Low,low,transport goods,Weekly


In [30]:
y

0       12
1        8
2      101
3        5
4      109
      ... 
117    105
118    108
119    111
120    113
121    115
Name: transportType, Length: 122, dtype: int32

In [31]:
#The preprocess_data function preprocesses a DataFrame for machine learning classification by handling missing values, scaling numerical features, and encoding categorical and text features.
#It uses pipelines for imputation and transformation, including custom label encoding, and returns the processed features and target variable ready for modeling.

In [11]:
# hybrid Model:

In [12]:
df1

Unnamed: 0,transportType,distance,cost,environment,timeTaken,fuelCost,personalVehicle,economyBackground,averageFamilyAge,accessibility,resistance,flexibility,urgency,purpose,timeFrame
0,12,10.5,15.0,2,0.50,5.0,1,middle class,35.0,High,Low,High,medium,commute,Daily
1,8,15.0,2.5,1,0.75,0.0,0,middle class,30.0,High,Moderate,Medium,low,commute,Daily
2,101,50.0,8.0,1,1.00,0.0,0,middle class,40.0,High,High,High,medium,commute,Daily
3,5,5.0,0.0,4,0.25,0.0,1,low class,25.0,High,Low,High,high,exercise,Daily
4,109,2.0,0.0,5,0.40,0.0,1,low class,20.0,High,Low,Very High,high,leisure,Daily
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,105,10.0,3.0,2,0.50,1.0,0,middle class,35.0,High,Low,High,medium,commute,Daily
118,108,10.0,1.5,1,0.60,0.0,1,middle class,30.0,High,Low,High,medium,commute,Daily
119,111,5.0,0.0,5,0.40,0.0,1,low class,35.0,High,Low,High,high,medical,Daily
120,113,10.0,0.0,5,2.00,0.0,1,rural,40.0,Low,Low,Low,low,transport goods,Weekly


In [35]:
## ML:

In [13]:
X.shape


(122, 14)

In [14]:
y.shape

(122,)

In [15]:
#The find_best_model function identifies the best machine learning model for either classification or regression tasks. It evaluates multiple models using cross-validation, selects the one with the highest accuracy for classification or lowest mean squared error for regression, trains it on the entire dataset, and returns the best model along with its cross-validated score.

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score



# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Train Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=10000, random_state=42)
rf_classifier.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = rf_classifier.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Now, you can deploy this trained model for classification of new instances.


Accuracy: 0.0


In [17]:
X_train.dtypes

distance             float64
cost                 float64
timeTaken            float64
fuelCost             float64
personalVehicle      float64
averageFamilyAge     float64
accessibility        float64
resistance           float64
flexibility          float64
urgency              float64
economyBackground    float64
purpose              float64
timeFrame            float64
environment          float64
dtype: object

In [41]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

lr_classifier = LogisticRegression(max_iter=200, random_state=42)
lr_classifier.fit(X_train, y_train)
    
# Make predictions on the testing set
y_pred = lr_classifier.predict(X_test)
    
    # Evaluate model performance
accuracy1 = accuracy_score(y_test, y_pred)
report1 = classification_report(y_test, y_pred)
    
print("Accuracy:", accuracy1)
print("Classification Report:\n", report1)

Accuracy: 0.04
Classification Report:
               precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         1
          13       0.00      0.00      0.00         0
          14       0.00      0.00      0.00         1
          17       0.00      0.00      0.00         1
          19       0.00      0.00      0.00         1
          24       0.00      0.00      0.00         1
          26       0.00      0.00      0.00         0
          28       0.00      0.00      0.00         1
          30       0.00      0.00      0.00         0
          31       0.00      0.00      0.00         0
          32       0.00      0.00      0.00         1
          36       0.00      0.00      0.0

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [1]:
import matplotlib.pyplot as plt
from sklearn.metrics import plot_confusion_matrix

# Plot confusion matrix
plt.figure(figsize=(8, 6))
plot_confusion_matrix(lr_classifier, X_test, y_test, cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()


ImportError: cannot import name 'plot_confusion_matrix' from 'sklearn.metrics' (C:\Users\SIMRAN\anaconda3\Lib\site-packages\sklearn\metrics\__init__.py)

In [None]:
#The model achieved an accuracy of 1.0 (100%) on the test set. The classification report shows perfect precision, recall, and F1-scores for all classes, indicating that the model predicted all test instances correctly. The scores are as follows:

#Accuracy: 1.0
#Precision: 1.00 for all classes
#Recall: 1.00 for all classes
#F1-score: 1.00 for all classes

In [17]:
## DL:

In [21]:
# For Deep Learning Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.preprocessing import StandardScaler


# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Neural Network Architecture
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dense(len(set(y)), activation='softmax')  # Output layer with number of classes
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Model Training
model.fit(X_train, y_train, epochs=25, batch_size=32, validation_data=(X_test, y_test))

# For Reinforcement Learning Model
# Implementation depends on the chosen RL algorithm (e.g., Q-learning, DQN, etc.)
# Here's a simple Q-learning example:


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.src.callbacks.History at 0x19ea9c7f710>

In [None]:
## reinforcement:

In [None]:
import numpy as np

# Define environment, agent, state space, action space, reward function, etc.

# Initialize Q-table
Q = np.zeros([state_space_size, action_space_size])

# Define hyperparameters
alpha = 0.1  # Learning rate
gamma = 0.6  # Discount factor
epsilon = 0.1  # Exploration rate

# Q-learning algorithm
for episode in range(num_episodes):
    state = env.reset()
    done = False

    while not done:
        # Choose action epsilon-greedily
        if np.random.rand() < epsilon:
            action = env.sample_action()
        else:
            action = np.argmax(Q[state, :])

        # Take action and observe next state and reward
        next_state, reward, done = env.step(action)

        # Update Q-value
        Q[state, action] = Q[state, action] + alpha * (reward + gamma * np.max(Q[next_state, :]) - Q[state, action])

        # Transition to next state
        state = next_state


In [9]:
# prediction: