In [None]:
!wget https://raw.githubusercontent.com/The-CEAS-Library/Data-Analysis-with-Python-Linear-Regression/main/auto.csv

In [None]:
import pandas as pd
import numpy as np

# Read the raw file and turn every '?' placeholder into NaN
automobile = pd.read_csv('auto.csv').replace('?', np.nan)

# Build the modelling frame in a single chain:
#   - discard every row that contains a NaN in any column
#   - renumber the rows from 0 without keeping the old index as a column
#   - remove the 'symboling' column
#   - store 'price' and 'horsepower' as float64
auto_df = (
    automobile
    .dropna()
    .reset_index(drop=True)
    .drop(columns=['symboling'])
    .astype({'price': 'float64', 'horsepower': 'float64'})
)

## **Exercise 1**

* Create a linear regression model that predicts the **'highway mpg'** of an automobile for a given **'city mpg'**.
  * Use simple linear regression to train the model
  * Evaluate the model by printing the error metrics and the R$^2$ value
  * Plot the best fit line between the predicted and test values

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import statsmodels.api as sm


# Feature (city mpg) and target (highway mpg), each reshaped to a 2-D column array
X = auto_df['city mpg'].values.reshape(-1,1)
y = auto_df['highway mpg'].values.reshape(-1,1)

# Hold out a test set. The fixed seed keeps the split - and therefore every number
# printed below - identical on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Print intercept and slope:
print("Intercept: ", model.intercept_)
print("Slope: ", model.coef_)

# Predicting the values
y_pred = model.predict(X_test)

# Evaluating the model on the held-out data
mse = mean_squared_error(y_test, y_pred)
print('Mean Squared Error:', mse)
# RMSE is taken with np.sqrt: the `squared=False` keyword was deprecated in
# scikit-learn 1.4 and removed in later releases, where it raises a TypeError.
print('Root Mean Squared Error:', np.sqrt(mse))
print('R2 Value:', r2_score(y_test, y_pred))

# Visualizing the test results: actual test points against the fitted line,
# i.e. the same data the metrics above were computed on
plt.scatter(X_test, y_test, color='red', label='Actual (test set)')
plt.plot(X_test, y_pred, color='blue', label='Predicted (best-fit line)')
plt.title('Predicted Highway MPG vs City MPG ')
plt.ylabel('highway mpg')
plt.xlabel('city mpg')
plt.legend()
plt.show()



## **Exercise 2**

* Write a function to establish a relationship between the **'curb weight'** (independent variable) and the **'city mpg'** (dependent variable) of the automobile. This function should take the **'curb weight'** and **'city mpg'** values as inputs.
  * Use OLS method of statsmodels.
  * Evaluate the performance of the model by calculating the R$^2$ value and the root mean squared error between the actual values and the values predicted by the model.
  * Plot the best-fit line.



In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

def mpg_prediction(crb_wt, c_mpg, random_state=42):
  """Fit an OLS line of city mpg on curb weight and report how well it predicts.

  The inputs are split into training and test subsets, a statsmodels OLS model
  (with intercept) is fitted on the training subset, R^2 and RMSE on the test
  subset are printed, the fit is plotted, and the statsmodels summary is printed.

  Parameters
  ----------
  crb_wt : array-like
      Independent variable (curb weight), one row per automobile.
  c_mpg : array-like
      Dependent variable (city mpg), one value per row of `crb_wt`.
  random_state : int or None, default 42
      Seed forwarded to `train_test_split` so the split, metrics and plot are
      reproducible. Pass None for a different random split on every call.

  Returns
  -------
  None
      All results are printed / plotted.
  """
  # Split the data into training and test subsets
  x3_train, x3_test, y3_train, y3_test = train_test_split(
      crb_wt, c_mpg, random_state=random_state)

  # Prepend the intercept column. has_constant='add' forces the column to be added;
  # the default ('skip') leaves it out when a subset happens to be constant
  # (e.g. a single-row test set), which would make train/test shapes disagree.
  x3_train_sm = sm.add_constant(x3_train, has_constant='add')
  x3_test_sm = sm.add_constant(x3_test, has_constant='add')

  # Choose the model - ordinary least squares (OLS) - and fit it on the training data
  results = sm.OLS(y3_train, x3_train_sm).fit()

  # Predict the outcomes for the held-out data
  y3_pred = results.predict(x3_test_sm)

  # Evaluate on the held-out data (actual vs predicted)
  print('Test R2 Value:', r2_score(y3_test, y3_pred))
  print('Test Root Mean Squared Error:', np.sqrt(mean_squared_error(y3_test, y3_pred)))

  # Plotting the best fit line over the training points
  plt.scatter(x3_train, y3_train)
  plt.plot(x3_test, y3_pred, color='r')
  plt.xlabel('Curb Weight')
  plt.ylabel('City MPG')
  plt.title('Curb Weight vs City MPG')
  plt.show()

  # Full statsmodels report for the training fit
  print(results.summary())



In [None]:
# Inputs for Exercise 2: curb weight as an (n, 1) column array, city mpg as a Series
c_mpg = auto_df['city mpg']
crb_wt = auto_df['curb weight'].to_numpy().reshape(-1, 1)

# Fit, plot and summarise the curb weight -> city mpg regression
mpg_prediction(crb_wt, c_mpg)
