# Comparison between training Algorithms

# Linear Regression

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import cross_val_score, KFold



# Load your dataset from CSV files
train_data_path = 'C:\\Users\\moham\\Desktop\\Apprentissage auto\\train.csv'
test_data_path = 'C:\\Users\\moham\\Desktop\\Apprentissage auto\\test.csv'


# Assuming your CSV files have a header, if not, set header=None in read_csv
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

# Separate features and target variable
X_train = train_data.drop('Price', axis=1)  # Features
y_train = train_data['Price']  # Target variable

X_test = test_data.drop('Price', axis=1)  # Features
y_test = test_data['Price']  # Target variable

# Handle missing values in the target variable
y_train = y_train.fillna(y_train.mean())  # You can use other imputation strategies

# Define numeric and categorical features
numeric_features = ['Prod. year', 'Cylinders', 'Airbags']
categorical_features = ['Manufacturer', 'Model', 'Category', 'Fuel type', 'Gear box type', 'Drive wheels', 'Doors', 'Wheel', 'Color', 'Leather interior']

# Create transformers for numeric and categorical features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Create a preprocessor using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create a pipeline with preprocessor and regressor
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Handle missing values in the target variable for the test set
y_test = y_test.fillna(y_train.mean())  # Use the mean from the training set


# Define the cross-validation strategy
cv = KFold(n_splits=5, shuffle=True, random_state=42)  # Adjust the number of splits as needed

# Perform cross-validation
cv_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=cv)

# Display the cross-validation scores
print("Cross-Validation Scores:", cv_scores)
print("Mean Squared Error (Cross-Validation):", -cv_scores.mean())

# Fit the model
model.fit(X_train, y_train)

# ...


# Make predictions on the test set
predictions = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')

# Calculate R-squared
r_squared = r2_score(y_test, predictions)
print(f'R-squared: {r_squared}')





Cross-Validation Scores: [-1.78718199e+09 -9.33407527e+08 -1.80270728e+11 -3.57805653e+09
 -2.23619107e+09]
Mean Squared Error (Cross-Validation): 37761112942.04208
Mean Squared Error: 1574265220.4045472
R-squared: -1.1894811699177326e+32


# K nearest Neighbors

In [1]:
from sklearn.neighbors import KNeighborsRegressor

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import cross_val_score, KFold



# Load your dataset from CSV files
train_data_path = 'C:\\Users\\moham\\Desktop\\Apprentissage auto\\train.csv'
test_data_path = 'C:\\Users\\moham\\Desktop\\Apprentissage auto\\test.csv'


# Assuming your CSV files have a header, if not, set header=None in read_csv
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

# Separate features and target variable
X_train = train_data.drop('Price', axis=1)  # Features
y_train = train_data['Price']  # Target variable

X_test = test_data.drop('Price', axis=1)  # Features
y_test = test_data['Price']  # Target variable

# Handle missing values in the target variable
y_train = y_train.fillna(y_train.mean())  # You can use other imputation strategies

# Define numeric and categorical features
numeric_features = ['Prod. year', 'Cylinders', 'Airbags']
categorical_features = ['Manufacturer', 'Model', 'Category', 'Fuel type', 'Gear box type', 'Drive wheels', 'Doors', 'Wheel', 'Color', 'Leather interior']

# Create transformers for numeric and categorical features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Create a preprocessor using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])


# Create a pipeline with preprocessor and KNN regressor
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', KNeighborsRegressor(n_neighbors=5))  # You can adjust the number of neighbors (n_neighbors)
])


# Handle missing values in the target variable for the test set
y_test = y_test.fillna(y_train.mean())  # Use the mean from the training set


# Define the cross-validation strategy
cv = KFold(n_splits=5, shuffle=True, random_state=42)  # Adjust the number of splits as needed

# Perform cross-validation
cv_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=cv)

# Display the cross-validation scores
print("Cross-Validation Scores:", cv_scores)
print("Mean Squared Error (Cross-Validation):", -cv_scores.mean())

# Fit the model
model.fit(X_train, y_train)

# ...


# Make predictions on the test set
predictions = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')

# Calculate R-squared
r_squared = r2_score(y_test, predictions)
print(f'R-squared: {r_squared}')



Cross-Validation Scores: [-7.33356281e+09 -7.39367758e+09 -1.80168647e+11 -7.30735588e+09
 -7.31765448e+09]
Mean Squared Error (Cross-Validation): 41904179580.14286
Mean Squared Error: 3572969736.3766427
R-squared: -2.699659604379624e+32


# Random Forest 

In [2]:
from sklearn.neighbors import KNeighborsRegressor

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import RandomForestRegressor



# Load your dataset from CSV files
train_data_path = 'C:\\Users\\moham\\Desktop\\Apprentissage auto\\train.csv'
test_data_path = 'C:\\Users\\moham\\Desktop\\Apprentissage auto\\test.csv'


# Assuming your CSV files have a header, if not, set header=None in read_csv
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

# Separate features and target variable
X_train = train_data.drop('Price', axis=1)  # Features
y_train = train_data['Price']  # Target variable

X_test = test_data.drop('Price', axis=1)  # Features
y_test = test_data['Price']  # Target variable

# Handle missing values in the target variable
y_train = y_train.fillna(y_train.mean())  # You can use other imputation strategies

# Define numeric and categorical features
numeric_features = ['Prod. year', 'Cylinders', 'Airbags']
categorical_features = ['Manufacturer', 'Model', 'Category', 'Fuel type', 'Gear box type', 'Drive wheels', 'Doors', 'Wheel', 'Color', 'Leather interior']

# Create transformers for numeric and categorical features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Create a preprocessor using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])


model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))  # You can adjust the number of trees (n_estimators) and other hyperparameters
])


# Handle missing values in the target variable for the test set
y_test = y_test.fillna(y_train.mean())  # Use the mean from the training set


# Define the cross-validation strategy
cv = KFold(n_splits=5, shuffle=True, random_state=42)  # Adjust the number of splits as needed

# Perform cross-validation
cv_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=cv)

# Display the cross-validation scores
print("Cross-Validation Scores:", cv_scores)
print("Mean Squared Error (Cross-Validation):", -cv_scores.mean())

# Fit the model
model.fit(X_train, y_train)

# ...


# Make predictions on the test set
predictions = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')

# Calculate R-squared
r_squared = r2_score(y_test, predictions)
print(f'R-squared: {r_squared}')



Cross-Validation Scores: [-1.05160231e+08 -1.59157990e+08 -1.80135869e+11 -1.24457187e+11
 -1.96862623e+10]
Mean Squared Error (Cross-Validation): 64908727299.34157
Mean Squared Error: 30871722170.718826
R-squared: -2.332601376759461e+33


# Neural Network

In [None]:
from sklearn.neighbors import KNeighborsRegressor

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor



# Load your dataset from CSV files
train_data_path = 'C:\\Users\\moham\\Desktop\\Apprentissage auto\\train.csv'
test_data_path = 'C:\\Users\\moham\\Desktop\\Apprentissage auto\\test.csv'


# Assuming your CSV files have a header, if not, set header=None in read_csv
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

# Separate features and target variable
X_train = train_data.drop('Price', axis=1)  # Features
y_train = train_data['Price']  # Target variable

X_test = test_data.drop('Price', axis=1)  # Features
y_test = test_data['Price']  # Target variable

# Handle missing values in the target variable
y_train = y_train.fillna(y_train.mean())  # You can use other imputation strategies

# Define numeric and categorical features
numeric_features = ['Prod. year', 'Cylinders', 'Airbags']
categorical_features = ['Manufacturer', 'Model', 'Category', 'Fuel type', 'Gear box type', 'Drive wheels', 'Doors', 'Wheel', 'Color', 'Leather interior']

# Create transformers for numeric and categorical features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Create a preprocessor using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])


# Create a pipeline with preprocessor and MLP regressor
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', MLPRegressor(hidden_layer_sizes=(100, ), max_iter=500, random_state=42))  
    # You can adjust the hidden_layer_sizes, max_iter, and other hyperparameters
])


# Handle missing values in the target variable for the test set
y_test = y_test.fillna(y_train.mean())  # Use the mean from the training set


# Define the cross-validation strategy
cv = KFold(n_splits=5, shuffle=True, random_state=42)  # Adjust the number of splits as needed

# Perform cross-validation
cv_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=cv)

# Display the cross-validation scores
print("Cross-Validation Scores:", cv_scores)
print("Mean Squared Error (Cross-Validation):", -cv_scores.mean())

# Fit the model
model.fit(X_train, y_train)

# ...


# Make predictions on the test set
predictions = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')

# Calculate R-squared
r_squared = r2_score(y_test, predictions)
print(f'R-squared: {r_squared}')



