Upload the dataset file to Google Colab

In [4]:
from google.colab import files
import numpy as np

# Upload the file through the Colab file picker; the result is keyed by
# the name of each uploaded file.
uploaded = files.upload()

# Verify the upload
for filename, content in uploaded.items():
    print(f'User uploaded file "{filename}" with length {len(content)} bytes')

# Resolve which file to read instead of assuming a fixed name:
#   1. the expected 'Dataset.csv' if it was uploaded,
#   2. otherwise the first file that actually was uploaded,
#   3. otherwise (nothing uploaded) the expected name, as before.
DEFAULT_DATASET = 'Dataset.csv'
if DEFAULT_DATASET in uploaded:
    dataset_path = DEFAULT_DATASET
else:
    dataset_path = next(iter(uploaded), DEFAULT_DATASET)

# Quick load with genfromtxt's default (float) dtype, skipping the header row.
# Non-numeric columns cannot be parsed as float and come back as NaN here;
# the later typed load with an explicit `dtype` is what keeps the text columns.
data = np.genfromtxt(dataset_path, delimiter=',', skip_header=1)




Saving Dataset.csv to Dataset.csv
User uploaded file "Dataset.csv" with length 46831 bytes


Assign a data type to each column so that the numeric columns can be used in calculations

In [9]:
import numpy as np

# Field layout for the structured array, one (column name, type code) pair per
# CSV column. Type codes used:
#   'U10' -> Unicode string of maximum length 10
#   'i4'  -> 4-byte (i.e., 32-bit) integer
#   'f8'  -> 8-byte (i.e., 64-bit) float
dtype = [
    ('age', 'f8'),
    ('sex', 'U10'),
    ('bmi', 'f8'),
    ('children', 'i4'),
    ('smoker', 'U10'),
    ('region', 'U10'),
    ('charges', 'f8'),
]


Load the file into a NumPy structured array

In [10]:
import numpy as np
# Read the CSV into a structured array: skip the header row and parse each
# column using the field types declared in `dtype`.
data = np.genfromtxt(
    'Dataset.csv',
    dtype=dtype,
    delimiter=',',
    skip_header=1,
    encoding=None,
)


**Basic Statistics**

Calculate mean, median, and standard deviation for numerical columns.

In [11]:
# Pull out the three numeric columns as float arrays
ages, bmis, charges = (
    data[column].astype(float) for column in ('age', 'bmi', 'charges')
)

# Mean, median, and standard deviation (NumPy's default, ddof=0) per column
mean_age, median_age, std_age = ages.mean(), np.median(ages), ages.std()
mean_bmi, median_bmi, std_bmi = bmis.mean(), np.median(bmis), bmis.std()
mean_charges, median_charges, std_charges = (
    charges.mean(),
    np.median(charges),
    charges.std(),
)

# Report one line per column
print(f"Mean Age: {mean_age}, Median Age: {median_age}, Standard Deviation Age: {std_age}")
print(f"Mean BMI: {mean_bmi}, Median BMI: {median_bmi}, Standard Deviation BMI: {std_bmi}")
print(f"Mean Charges: {mean_charges}, Median Charges: {median_charges}, Standard Deviation Charges: {std_charges}")

Mean Age: 39.566992014196984, Median Age: 40.0, Standard Deviation Age: 13.998310587094037
Mean BMI: 30.746597160603365, Median BMI: 30.495, Standard Deviation BMI: 6.077269278683908
Mean Charges: 13267.379372598936, Median Charges: 9487.6442, Standard Deviation Charges: 12033.899980810114


This cell separates the people who smoke from those who do not, and counts each group

In [None]:

# Boolean masks, one per smoking status
is_smoker = data['smoker'] == 'yes'
is_non_smoker = data['smoker'] == 'no'

# Rows belonging to each group
smokers = data[is_smoker]
non_smokers = data[is_non_smoker]

# Group sizes are simply the number of selected rows
num_smokers = len(smokers)
num_non_smokers = len(non_smokers)

print(f"Number of Smokers: {num_smokers}")
print(f"Number of Non-Smokers: {num_non_smokers}")



Number of Smokers: 222
Number of Non-Smokers: 905


**ELEMENT-WISE OPERATIONS**
This cell increases every BMI value by 2%.

In [None]:
# Multiplying the array by a scalar scales every element at once (+2%)
bmi = data['bmi']
increased_bmi = 1.02 * bmi

# Show only the first 5 values of each array
print(f"Original BMI: {bmi[:5]}")
print(f"Increased BMI: {increased_bmi[:5]}")

Original BMI: [27.9   33.77  33.    22.705 28.88 ]
Increased BMI: [28.458  34.4454 33.66   23.1591 29.4576]


Minimum and maximum charges

In [13]:
# Largest and smallest values in the `charges` array
max_charges = charges.max()
min_charges = charges.min()

print(f"Maximum Charges: {max_charges}")
print(f"Minimum Charges: {min_charges}")


Maximum Charges: 63770.42801
Minimum Charges: 1121.8739


**Group by Region and Calculate Average Charges:**

In [7]:
# Manual "group by": for each distinct region, select its rows with a boolean
# mask and average their charges.
regions = np.unique(data['region'])
for region in regions:
    in_region = data['region'] == region
    region_data = data[in_region]
    avg_charges_region = region_data['charges'].mean()
    print(f"Average Charges in {region}: {avg_charges_region}")


Average Charges in northeast: 13948.890969107408
Average Charges in northwest: 12491.738191607143
Average Charges in southeast: 14572.76210632107
Average Charges in southwest: 11982.710172014387


**Apply some slicing concepts to conclude the NumPy section**

In [16]:
import numpy as np



# Extract some rows (the slice end is exclusive, so this selects rows 400..409)
specific_rows = data[400:410]
print("Rows between 400 to 410:")
print(specific_rows)

# Extract rows where the age is greater than 50
age_above_50 = data[data['age'] > 50]
print("\nRows where age is greater than 50:")
print(age_above_50)

# Extract charges column for the 400 to 410 rows.
# Kept under its own name so the full `charges` column used by the statistics
# and min/max cells is not overwritten by this 10-row slice.
charges_400_410 = data['charges'][400:410]
print("\nCharges for the 400 to 410 rows:")
print(charges_400_410)

# Extract ages and BMIs for all smokers
smokers_ages_bmis = data[data['smoker'] == 'yes'][['age', 'bmi']]
print("\nAges and BMIs for smokers:")
print(smokers_ages_bmis)

# Extract all rows where BMI is between 25 and 30 (both bounds inclusive)
bmi_between_25_30 = data[(data['bmi'] >= 25) & (data['bmi'] <= 30)]
print("\nRows where BMI is between 25 and 30:")
print(bmi_between_25_30)

# Extract ages for non-smokers in the first 50 rows.
# Slice to the first 50 rows FIRST, then filter; filtering first and slicing
# afterwards would instead return the first 50 non-smokers of the whole dataset.
first_50_rows = data[:50]
ages_non_smokers_first_50 = first_50_rows[first_50_rows['smoker'] == 'no']['age']
print("\nAges for non-smokers in the first 50 rows:")
print(ages_non_smokers_first_50)


Rows between 400 to 410:
[(54., 'female', 21.47, 3, 'no', 'northwest', 12475.3513 )
 (19., 'male', 28.7 , 0, 'no', 'southwest',  1253.936  )
 (44., 'female', 38.06, 0, 'yes', 'southeast', 48885.13561)
 (53., 'male', 31.16, 1, 'no', 'northwest', 10461.9794 )
 (19., 'female', 32.9 , 0, 'no', 'southwest',  1748.774  )
 (61., 'female', 25.08, 0, 'no', 'southeast', 24513.09126)
 (18., 'female', 25.08, 0, 'no', 'northeast',  2196.4732 )
 (61., 'male', 43.4 , 0, 'no', 'southwest', 12574.049  )
 (21., 'male', 25.7 , 4, 'yes', 'southwest', 17942.106  )
 (20., 'male', 27.93, 0, 'no', 'northeast',  1967.0227 )]

Rows where age is greater than 50:
[(59., 'female', 27.72 , 3, 'no', 'southeast', 14001.1338 )
 (63., 'female', 23.085, 0, 'no', 'northeast', 14451.83515)
 (55., 'female', 32.775, 2, 'no', 'northwest', 12268.63225)
 (63., 'male', 28.31 , 0, 'no', 'northwest', 13770.0979 )
 (62., 'female', 32.965, 3, 'no', 'northwest', 15612.19335)
 (60., 'male', 39.9  , 0, 'yes', 'southwest', 48173.361  )