# Step 1: Install the necessary packages

In [None]:
!pip install --upgrade pip

In [None]:
!pip install -r requirements.txt

# Step 2: Ingest the Raw Dataset and perform EDA

In [None]:
import boto3
import pandas as pd
from io import StringIO

# Location of the raw CSV: S3 bucket name and object key
bucket_name = 'awssagemaker-xgboost'
file_key = 'raw/Life_Expectancy_Data.csv'

# Create the S3 client used for the download
s3 = boto3.client('s3')

# Fetch the object, read its whole body and decode the bytes as UTF-8 text
response = s3.get_object(Key=file_key, Bucket=bucket_name)
csv_content = response['Body'].read().decode('utf-8')

# Parse the in-memory CSV text into a DataFrame
df = pd.read_csv(StringIO(csv_content))

# Print the loaded DataFrame
print(df)

In [None]:
# Print a summary of the DataFrame via df.info()
# (column names, non-null counts and data types)
df.info()


In [None]:
# Count the missing values in each column of the DataFrame
df.isna().sum()

In [None]:
# Plotting the null values
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px # Interactive Data Visualization
# Draw the boolean null mask of df as a heatmap, without row tick labels
# or a colour bar, then render the figure
sns.heatmap(
    data=df.isnull(),
    cmap="Blues",
    cbar=False,
    yticklabels=False,
)
plt.show()

# Step 3: Feature Engineering

In [None]:
# Drop every row that contains at least one null value
df = df.dropna(axis=0, how='any')

In [None]:
# Recount the missing values per column after the null rows were dropped
df.isna().sum()

In [None]:
# Re-plot the null mask to confirm visually whether any null values remain
sns.heatmap(
    data=df.isnull(),
    cmap="Blues",
    cbar=False,
    yticklabels=False,
)
plt.show()

In [None]:
# One-hot encode the 'Status' column into a new DataFrame; df itself is not modified
# NOTE(review): life_expectancy_df is not referenced by the other cells visible here — confirm it is used later
life_expectancy_df = pd.get_dummies(data=df, columns=['Status'])

In [None]:
# Draw histograms of the DataFrame columns (30 bins, 20x20 figure, red) and show them
df.hist(color='r', figsize=(20, 20), bins=30)
plt.show()

In [None]:
# Correlation heatmap of the float/int columns

# Keep only the columns whose dtype is float or int
numeric_df = df.select_dtypes(include=[float, int])

# Pairwise correlations between those columns
corr_matrix = numeric_df.corr()

# Draw the annotated heatmap on a 20x20 figure
plt.figure(figsize=(20, 20))
sns.heatmap(
    data=corr_matrix,
    linewidths=0.5,
    cmap='coolwarm',
    annot=True,
)
plt.show()

In [None]:
# Print the DataFrame summary again, now that the null rows have been dropped
df.info()

In [None]:

from sklearn.preprocessing import LabelEncoder

# Label-encode every string (object dtype) column of df in place.

# Identify columns with string data types (categorical data)
categorical_columns = df.select_dtypes(include=['object']).columns

# Keep one fitted encoder per column. Refitting a single shared encoder
# overwrites its classes_ on every iteration, so only the last column's
# category -> integer mapping would survive and the other columns could not
# be decoded (inverse_transform) or encoded consistently on new data.
label_encoders = {}

# `le` stays defined for any later cell that references it: it ends up bound
# to the encoder of the last categorical column, or to an unfitted encoder
# when there are no categorical columns.
le = LabelEncoder()

# Fit a fresh encoder on each categorical column and replace the column with
# its integer codes
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Show the resulting column dtypes
df.info()

In [None]:
# Write the DataFrame to cleaned_data.csv without the index column
df.to_csv(path_or_buf='cleaned_data.csv', index=False)