In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Step 1: Load Dataset
df = pd.read_csv("std_marks_data.csv")

# Step 2: Explore Data
print("First 5 rows of dataset:\n", df.head())
print("\nMissing values in each column:\n", df.isnull().sum())

# Step 3: Handle Missing Values (Replace NaN with Column Mean)
df.fillna(df.mean(numeric_only=True), inplace=True)

# Step 4: Encode Categorical Variables (if 'internet' is categorical)
if df["internet"].dtype == "object":
    df["internet"] = df["internet"].map({"yes": 1, "no": 0})

# Step 5: Segregate Input (X) and Output (y)
X = df[["hours", "age", "internet"]]  # Independent variables
y = df["marks"]  # Dependent variable

# Step 6: Split Data (80% Train, 20% Test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: Train Model
model = LinearRegression()
model.fit(X_train, y_train)

# Step 8: Evaluate Model
y_pred = model.predict(X_test)

print("\nModel Performance:")
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("R² Score:", r2_score(y_test, y_pred))

# Step 9: Test with New Input Data
new_data = np.array([[5, 18, 1]])  # Example: 5 hours, 18 years old, internet available (1)
predicted_marks = model.predict(new_data)

print("\nPredicted Marks for New Data:", predicted_marks[0])


First 5 rows of dataset:
    hours  age  internet  marks
0   6.84   15         0  78.64
1   6.56   20         1  88.80
2    NaN   21         1  88.90
3   8.67   22         1  98.99
4   7.55   17         1  92.34

Missing values in each column:
 hours       12
age          0
internet     0
marks        0
dtype: int64

Model Performance:
Mean Absolute Error: 15.093878830955454
Mean Squared Error: 296.9278739472545
R² Score: 0.04539029325368027

Predicted Marks for New Data: 74.11327579029636




In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression

In [2]:
# Load dataset
df = pd.read_csv('std_marks_data.csv')

# Display first 5 rows and basic info
print("First 5 rows:")
print(df.head())
print("\nDataset info:")
print(df.info())

First 5 rows:
   hours  age  internet  marks
0   6.84   15         0  78.64
1   6.56   20         1  88.80
2    NaN   21         1  88.90
3   8.67   22         1  98.99
4   7.55   17         1  92.34

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   hours     288 non-null    float64
 1   age       300 non-null    int64  
 2   internet  300 non-null    int64  
 3   marks     300 non-null    float64
dtypes: float64(2), int64(2)
memory usage: 9.5 KB
None


In [10]:
df.isnull()

Unnamed: 0,hours,age,internet,marks
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
...,...,...,...,...
295,False,False,False,False
296,False,False,False,False
297,False,False,False,False
298,False,False,False,False


In [12]:
# Check missing values
print("\nMissing values before preprocessing:")
print(df.isnull().sum())

# Fill NaN with mean (for numeric columns only)
numeric_cols = ['hours', 'age', 'marks']
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# For 'internet' (binary: 0/1), fill with mode (most frequent value)
df['internet'] = df['internet'].fillna(df['internet'].mode()[0])

# Verify no missing values remain
print("\nMissing values after preprocessing:")
print(df.isnull().sum())


Missing values before preprocessing:
hours       0
age         0
internet    0
marks       0
dtype: int64

Missing values after preprocessing:
hours       0
age         0
internet    0
marks       0
dtype: int64


In [14]:
# Input features (X) and target (y)
X = df[['hours', 'age', 'internet']]  # Predictors
y = df['marks']                        # Target variable

In [15]:
# Initialize and train the model
model = LinearRegression()
model.fit(X, y)

# Get coefficients and intercept
coefficients = model.coef_
intercept = model.intercept_

print("\nModel Parameters:")
print(f"Coefficients (hours, age, internet): {coefficients}")
print(f"Intercept: {intercept:.2f}")


Model Parameters:
Coefficients (hours, age, internet): [1.25998261 0.32935904 3.11097722]
Intercept: 57.95


In [13]:
# Calculate R² score (accuracy)
r_squared = model.score(X, y)
print(f"\nCoefficient of Determination (R²): {r_squared:.4f}")


Coefficient of Determination (R²): 0.0455


In [20]:
# Create new input data for prediction
new_data = pd.DataFrame({
    'hours': [3, 5, 4],    # Hours studied
    'age': [17, 20, 19],   # Student age
    'internet': [1, 0, 1]  # Internet access (1=yes, 0=no)
})

# Predict marks
predicted_marks = model.predict(new_data)
new_data['predicted_marks'] = predicted_marks.round(2)

print("\nPredictions for New Data:")
print(new_data)


Predictions for New Data:
   hours  age  internet  predicted_marks
0      3   17         1            70.44
1      5   20         0            70.83
2      4   19         1            72.35


In [22]:

# Segregate Input (X) and Output (y)
X = df[["hours", "age", "internet"]]
y = df["marks"]

# Split Data (80% Train, 20% Test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on Test Data
y_pred = model.predict(X_test)

In [31]:
print("Accuracy of training data",model.score(X_train ,y_train))
print("Accuracy of testing data",model.score(X_test ,y_test))

Accuracy of training data 0.03426128260168593
Accuracy of testing data 0.04539029325368027


In [24]:
print(X_train)

     hours  age  internet
232   6.56   22         0
59    8.76   16         1
6     8.76   24         0
185   4.80   18         0
173   7.44   24         1
..     ...  ...       ...
188   9.33   15         0
71    5.67   17         1
106   8.80   15         1
270   2.99   21         0
102   6.56   16         1

[240 rows x 3 columns]


In [26]:
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on Test Data
y_pred = model.predict(X_test)