In [None]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

In [None]:
# Column layout of the housing data, kept for reference.
# NOTE(review): this list is never passed to read_csv below, so the frame's column
# labels come from whatever the file itself provides. If housing.csv turns out to
# have no header row, `names=column_names` is probably what was intended — TODO confirm.
column_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV',
]
data = pd.read_csv('housing.csv')

# Preview the first five rows (head() defaults to n=5).
print(data.head())


In [None]:
# Step 4: Separate the MEDV target from the feature columns and hold out 10% for testing
y = data['MEDV']
X = data.drop(columns='MEDV')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.1
)

# Step 5: Wrap a linear regression in a one-step pipeline and fit it on the training split
# (fit() returns the fitted pipeline itself, so the result can be bound directly).
model = Pipeline(steps=[('regressor', LinearRegression())]).fit(X_train, y_train)

# Step 6: Predict on both partitions
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)

# RMSE is the square root of the mean squared error
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

print("Linear Regression RMSE (Train):", rmse_train)
print("Linear Regression RMSE (Test):", rmse_test)

# Step 7: Determine if the model is overfitting or underfitting
# Two RMSE values computed on different samples are essentially never exactly equal,
# so comparing them with strict <, > and an `else` for equality leaves the "good fit"
# branch unreachable. Treat the two errors as equivalent when they agree within a
# relative tolerance instead.
FIT_REL_TOL = 0.2  # heuristic: allowed relative gap between train and test RMSE; tune per dataset

if np.isclose(rmse_train, rmse_test, rtol=FIT_REL_TOL, atol=0.0):
    print("The model seems to have a good fit.")
    color = 'green'
elif rmse_train < rmse_test:
    # Clearly worse on held-out data than on the data the model was fitted to.
    print("The model may be overfitting.")
    color = 'red'
else:
    # Training error clearly above test error. This is only a rough signal: a single
    # small test split can be "easier" than the training split by chance.
    print("The model may be underfitting.")
    color = 'blue'

# Step 8: Plot the predicted vs true values for training and testing sets
fig, axes = plt.subplots(1, 2, figsize=(12, 6))

# One (title, true values, predictions) entry per panel, left to right.
panels = [
    ('Training Set: True vs Predicted', y_train, y_train_pred),
    ('Testing Set: True vs Predicted', y_test, y_test_pred),
]

for ax, (title, y_true, y_hat) in zip(axes, panels):
    # Points take the diagnosis colour chosen in Step 7.
    ax.scatter(y_true, y_hat, color=color)

    # Identity line: where every point would fall if predictions were perfect.
    # Drawn in black on BOTH panels so it never shares a colour with the points
    # (the diagnosis colour can itself be 'blue').
    lo, hi = min(y_true), max(y_true)
    ax.plot([lo, hi], [lo, hi], color='black', linestyle='--')

    ax.set_title(title)
    ax.set_xlabel('True Values')
    ax.set_ylabel('Predicted Values')

fig.tight_layout()
plt.show()

In [None]:
# Step 1: Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Step 2: Generate synthetic data for demonstration
# 100,000 samples with 55 features each.
num_samples = 100000
num_features = 55

# Seed NumPy's global generator so the random draws below (and any later
# np.random.* calls that follow in execution order) are identical on every
# Restart & Run All.
np.random.seed(42)

# Feature values drawn uniformly from [0, 1)
X = np.random.rand(num_samples, num_features)


In [None]:
# Quick look at the synthetic feature matrix X built in the previous cell.
print(X)

In [None]:

# Target values are drawn at random, independently of the features in X
y = np.random.rand(num_samples)

# Step 3: Hold out 10% of the samples for testing (split fixed by random_state)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.1
)

# Step 4: Fit a plain linear regression on the training portion
# (fit() returns the estimator, so it can be bound in one expression).
lr = LinearRegression().fit(X_train, y_train)

# Step 5: Evaluate the model's performance
def calculate_rmse(model, X, y):
    """Return the root-mean-squared error of ``model`` on ``(X, y)``.

    Calls ``model.predict(X)`` and returns the square root of
    ``mean_squared_error(y, predictions)``.
    """
    return np.sqrt(mean_squared_error(y, model.predict(X)))

# Score the fitted model on each partition (training first, then testing)
rmse_train, rmse_test = (
    calculate_rmse(lr, features, target)
    for features, target in ((X_train, y_train), (X_test, y_test))
)

print("Linear Regression RMSE (Train):", rmse_train)
print("Linear Regression RMSE (Test):", rmse_test)



# Step 7: Plot the data and the regression line
# The x-axis is a sample index. Test samples are placed AFTER the training samples;
# starting both series at index 0 would draw the test points directly on top of the
# first len(y_test) training points.
train_idx = np.arange(len(y_train))
test_idx = np.arange(len(y_train), len(y_train) + len(y_test))

plt.figure(figsize=(10, 6))
plt.scatter(train_idx, y_train, color='blue', label='Training data')
plt.scatter(test_idx, y_test, color='red', label='Testing data')
# The "line" is the model's prediction for each training sample, drawn in index order.
plt.plot(train_idx, lr.predict(X_train), color='green', linewidth=2, label='Regression line')
plt.title('Linear Regression - Underfitting Example')
plt.xlabel('Sample Index')
plt.ylabel('Target Variable')
plt.legend()
plt.show()


# Step 6: Check for underfitting condition
# Underfitting shows up as poor error on the TRAINING data itself. Comparing train
# and test RMSE cannot detect it: a training error at or below the test error is the
# normal outcome for any fitted model. Compare instead against the trivial baseline
# of always predicting the training mean, whose RMSE is the standard deviation of
# y_train.
baseline_rmse = np.std(y_train)
UNDERFIT_RATIO = 0.95  # heuristic: the model must beat the baseline by at least 5%

if rmse_train >= UNDERFIT_RATIO * baseline_rmse:
    print("The model may be underfitting.")


In [None]:
# Step 1: Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Step 2: Generate synthetic data for demonstration
# Seed NumPy's global generator so the same 100 points are produced on every run.
np.random.seed(42)

# 100 samples with 1 feature
X = np.random.rand(100, 1) * 10  # Feature values between 0 and 10
y = 2 * X[:, 0] + np.random.randn(100)  # Linear relationship (slope 2) plus standard-normal noise

# Step 3: Keep 20% of the samples aside for testing (split fixed by random_state)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Step 4: Fit a linear regression on the training samples
# (fit() returns the estimator, so it can be bound in one expression).
lr = LinearRegression().fit(X_train, y_train)

# Step 5: Evaluate the model's performance
def calculate_rmse(model, X, y):
    """Return the root-mean-squared error of ``model`` on ``(X, y)``.

    Calls ``model.predict(X)`` and returns the square root of
    ``mean_squared_error(y, predictions)``.
    """
    return np.sqrt(mean_squared_error(y, model.predict(X)))

# Calculate RMSE for both training and testing sets
rmse_train = calculate_rmse(lr, X_train, y_train)
rmse_test = calculate_rmse(lr, X_test, y_test)

print("Linear Regression RMSE (Train):", rmse_train)
print("Linear Regression RMSE (Test):", rmse_test)

# Step 6: Check if the model fits properly
# Train and test RMSE are floats computed on different samples, so testing them with
# `==` would practically never succeed and the "fits properly" message could not be
# printed. Consider the fit proper when the two errors agree within a relative
# tolerance.
FIT_REL_TOL = 0.2  # heuristic: allowed relative gap between train and test RMSE; tune per dataset

if np.isclose(rmse_train, rmse_test, rtol=FIT_REL_TOL, atol=0.0):
    print("The model fits properly.")
else:
    print("The model may not fit properly.")

# Step 7: Show both partitions together with the fitted line on a single Axes
fig, ax = plt.subplots(figsize=(10, 6))

ax.scatter(X_train, y_train, color='blue', label='Training data')
ax.scatter(X_test, y_test, color='red', label='Testing data')

# Model predictions evaluated at the training feature values
ax.plot(X_train, lr.predict(X_train), color='green', linewidth=2, label='Regression line')

ax.set_title('Linear Regression - Proper Fit Example')
ax.set_xlabel('Feature (X)')
ax.set_ylabel('Target Variable (y)')
ax.legend()
plt.show()


In [None]:
# Importing necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Read the housing dataset from a CSV file in the working directory.
# NOTE(review): column_names is defined here but never passed to read_csv, so the
# frame's column labels come from whatever the file itself provides. If housing.csv
# turns out to have no header row, `names=column_names` is probably what was
# intended — TODO confirm.
column_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV',
]
housing_data = pd.read_csv('housing.csv')


# Separate the MEDV target from the predictor columns
y = housing_data['MEDV']
X = housing_data.drop(columns='MEDV')

# 70/30 train/test partition with a fixed random_state
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

# Standardize features: the scaler is fitted on the training split only and the
# same fitted transform is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a Lasso regression on the standardized training features
alpha = 0.010  # Regularization parameter
lasso = Lasso(alpha=alpha).fit(X_train_scaled, y_train)

# Predictions for the held-out split and for the training split
y_pred = lasso.predict(X_test_scaled)
y_pred_t = lasso.predict(X_train_scaled)

# Report the mean squared error on each split.
# `mse` is reused for both, so after this cell it holds the TRAINING value.
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error test:", mse)
mse = mean_squared_error(y_train, y_pred_t)
print("Mean Squared Error train:", mse)

# Pair each feature column with its fitted coefficient
coefficients = pd.DataFrame({'feature': X.columns, 'coefficient': lasso.coef_})
print(coefficients)
