In [1]:
import pandas as pd # this acces to the panda library

In [2]:
df=pd.read_csv("insurance.csv") # reads the file

In [3]:
df.head(8) #it display the elements based on given number

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552
5,31,female,25.74,0,no,southeast,3756.6216
6,46,female,33.44,1,no,southeast,8240.5896
7,37,female,27.74,3,no,northwest,7281.5056


In [4]:
df.shape #tells about size  

(1338, 7)

In [5]:
df.info() # information about data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [6]:
df.columns.tolist() # gives the all coloumns names in the data

['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges']

In [7]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [8]:
df["sex"].unique() # gives the types of data available in that coloumn

array(['female', 'male'], dtype=object)

In [9]:
df = df.drop_duplicates() # removes the duplicates
print("No.of rows remaining:", len(df)) # gives the remaining rows after removing

No.of rows remaining: 1337


In [10]:
df.isnull().sum() # Checks the Missing Values

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [11]:
num_cols = df.select_dtypes(include='number').columns   #fills the missing values
for col in num_cols:
    median_val = df[col].median()
    df[col] = df[col].fillna(median_val)
    
cat_cols = df.select_dtypes(include='object').columns    # Fill categorical columns with mode
for col in cat_cols:
    mode_val = df[col].mode()[0]
    df[col] = df[col].fillna(mode_val)

In [12]:
df.count

<bound method DataFrame.count of       age     sex     bmi  children smoker     region      charges
0      19  female  27.900         0    yes  southwest  16884.92400
1      18    male  33.770         1     no  southeast   1725.55230
2      28    male  33.000         3     no  southeast   4449.46200
3      33    male  22.705         0     no  northwest  21984.47061
4      32    male  28.880         0     no  northwest   3866.85520
...   ...     ...     ...       ...    ...        ...          ...
1333   50    male  30.970         3     no  northwest  10600.54830
1334   18  female  31.920         0     no  northeast   2205.98080
1335   18  female  36.850         0     no  southeast   1629.83350
1336   21  female  25.800         0     no  southwest   2007.94500
1337   61  female  29.070         0    yes  northwest  29141.36030

[1337 rows x 7 columns]>

In [13]:
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1337.0,1337.0,1337.0,1337.0
mean,39.222139,30.663452,1.095737,13279.121487
std,14.044333,6.100468,1.205571,12110.359656
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29,0.0,4746.344
50%,39.0,30.4,1.0,9386.1613
75%,51.0,34.7,2.0,16657.71745
max,64.0,53.13,5.0,63770.42801


In [14]:
df = pd.get_dummies(df, drop_first=True) # One hot encoding , because we dont need order

In [15]:
df.head()

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,False,True,False,False,True
1,18,33.77,1,1725.5523,True,False,False,True,False
2,28,33.0,3,4449.462,True,False,False,True,False
3,33,22.705,0,21984.47061,True,False,True,False,False
4,32,28.88,0,3866.8552,True,False,True,False,False


In [16]:
from sklearn.model_selection import train_test_split
X = df.drop('charges', axis=1)
y = df['charges']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
from sklearn.feature_selection import mutual_info_regression
mi_scores = mutual_info_regression(X_train, y_train)
mi_df = pd.DataFrame({
    'Feature': X_train.columns,
    'MI Score': mi_scores
})
mi_df = mi_df.sort_values(by='MI Score', ascending=False)
Important_features = mi_df['Feature'].head(10).tolist()
print("Important_features:",Important_features)

Important_features: ['age', 'smoker_yes', 'children', 'sex_male', 'bmi', 'region_northwest', 'region_southeast', 'region_southwest']


In [18]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
print("R² Score:", r2)

R² Score: 0.8069287081198012


In [19]:
X_new = df[Important_features]
X_train_1, X_test_1, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train_1, y_train)

y_pred = model.predict(X_test_1)
r2 = r2_score(y_test, y_pred)
print("R² Score:", r2)

R² Score: 0.8069287081198011


In [23]:
model = RandomForestRegressor(n_estimators=80, random_state=4)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("R² Score:", r2_score(y_test, y_pred))

R² Score: 0.8781667355675041


In [22]:
model = RandomForestRegressor(n_estimators=80, random_state=4)
model.fit(X_train_1, y_train)

y_pred = model.predict(X_test_1)
print("R² Score:", r2_score(y_test, y_pred))

R² Score: 0.8779777715709394
