# Comparison between training Algorithms

# Linear Regression

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import cross_val_score, KFold



# Load your dataset from CSV files
train_data_path = 'C:\\Users\\moham\\Desktop\\Apprentissage auto\\train.csv'
test_data_path = 'C:\\Users\\moham\\Desktop\\Apprentissage auto\\test.csv'


# Assuming your CSV files have a header, if not, set header=None in read_csv
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

# Separate features and target variable
X_train = train_data.drop('Price', axis=1)  # Features
y_train = train_data['Price']  # Target variable

X_test = test_data.drop('Price', axis=1)  # Features
y_test = test_data['Price']  # Target variable

# Handle missing values in the target variable
y_train = y_train.fillna(y_train.mean())  # You can use other imputation strategies

# Define numeric and categorical features
numeric_features = ['Prod. year', 'Cylinders', 'Airbags']
categorical_features = ['Manufacturer', 'Model', 'Category', 'Fuel type', 'Gear box type', 'Drive wheels', 'Doors', 'Wheel', 'Color', 'Leather interior']

# Create transformers for numeric and categorical features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Create a preprocessor using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create a pipeline with preprocessor and regressor
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Handle missing values in the target variable for the test set
y_test = y_test.fillna(y_train.mean())  # Use the mean from the training set



# ... (your previous code)

# Define the cross-validation strategy
cv = KFold(n_splits=5, shuffle=True, random_state=42)  # Adjust the number of splits as needed

# Perform cross-validation
cv_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=cv)

# Display the cross-validation scores
print("Cross-Validation Scores:", cv_scores)
print("Mean Squared Error (Cross-Validation):", -cv_scores.mean())

# Fit the model
model.fit(X_train, y_train)

# ...


# Make predictions on the test set
predictions = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')

# Calculate R-squared
r_squared = r2_score(y_test, predictions)
print(f'R-squared: {r_squared}')





Cross-Validation Scores: [-1.78718199e+09 -9.33407527e+08 -1.80270728e+11 -3.57805653e+09
 -2.23619107e+09]
Mean Squared Error (Cross-Validation): 37761112942.04208
Mean Squared Error: 1574265220.4045472
R-squared: -1.1894811699177326e+32
