In [4]:
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split


In [5]:
# Load the Iris dataset and wrap it in pandas objects:
#   X -> one column per measured feature (named after iris.feature_names)
#   y -> single-column frame 'Target' holding the integer class labels
iris = datasets.load_iris()
X = pd.DataFrame(data=iris.data, columns=iris.feature_names)
y = pd.Series(iris.target, name='Target').to_frame()

In [None]:
# Preview the feature matrix: heading line followed by the first five rows
print("First 5 rows of data:", X.head(), sep="\n")


First 5 rows of data:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2


In [10]:
# Preview the first five class labels
print(y.head(5))

   Target
0       0
1       0
2       0
3       0
4       0


In [None]:
# Split data into training and test sets (70% training, 30% testing).
# random_state pins the shuffle so the split is reproducible across runs.
# stratify keeps the class-label proportions of y the same in both subsets;
# without it the small test set can end up with an unbalanced class mix.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y['Target'],
)

In [None]:
# Report how many rows landed in each subset (one line per count)
print(
    f"\nTotal Samples: {len(X)}",
    f"Training Samples: {len(X_train)}",
    f"Test Samples: {len(X_test)}",
    sep="\n",
)


Total Samples: 150
Training Samples: 105
Test Samples: 45


AICS Practical 4

Titanic passenger data (train.csv and test.csv)

In [11]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [12]:
# Read the labelled (train) and unlabelled (test) passenger tables from /content
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/train.csv', '/content/test.csv')
)


In [13]:
# Peek at the first five training rows (rendered as the cell's output)
train_data.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Embarked encoding: C -> 0, Q -> 1, S -> 2

In [14]:
# Peek at the first five test rows (rendered as the cell's output)
test_data.head(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [15]:
# Print (rows, columns) for each raw table
for caption, frame in (
    ("Train Data Shape:", train_data),
    ("Test Data Shape:", test_data),
):
    print(caption, frame.shape)

Train Data Shape: (891, 12)
Test Data Shape: (418, 11)


In [16]:
# Stack train and test rows into one frame so every preprocessing step below
# is applied to both. The index is renumbered 0..n-1; the test file has one
# column fewer than the train file, so 'Survived' is NaN for the test rows.
data = pd.concat(objs=[train_data, test_data], ignore_index=True)

In [17]:
# Peek at the first five rows of the combined frame
data.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [20]:
# Count missing values per column of the combined frame
data.isna().sum()

Unnamed: 0,0
PassengerId,0
Survived,418
Pclass,0
Name,0
Sex,0
Age,263
SibSp,0
Parch,0
Ticket,0
Fare,1


In [21]:
# 1. Handle Missing Values
# Columns are reassigned rather than filled with inplace=True, which avoids
# chained-assignment pitfalls.
# NOTE(review): the medians and the mode are computed on the combined
# train+test frame, so test rows influence the fill values.
for numeric_column in ('Age', 'Fare'):
    # Numeric gaps are replaced by that column's median
    data[numeric_column] = data[numeric_column].fillna(data[numeric_column].median())

# Categorical gap is replaced by the most frequent port code
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])

In [22]:
# 2. Drop Unused Columns
# Removes Cabin, Ticket and Name from the combined frame. Only part of this is
# about missing data: the null counts above report zero missing values for
# Name and Ticket.
data = data.drop(columns=['Cabin', 'Ticket', 'Name'])

In [23]:
# 3. Encode Categorical Features
# Each column gets its own fitted LabelEncoder, stored under the column name.
# Refitting a single shared encoder on the second column would overwrite the
# classes learned for the first, making the 'Sex' codes impossible to invert.
# LabelEncoder assigns codes in sorted label order, which gives
# female -> 0, male -> 1 and C -> 0, Q -> 1, S -> 2.
label_encoders = {}
for column in ['Sex', 'Embarked']:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

# Backward compatibility: the original cell left `label_encoder` fitted on
# 'Embarked' (the last column it processed), so keep that binding available.
label_encoder = label_encoders['Embarked']


In [24]:
# 4. Feature Scaling
# Standardise Age and Fare with StandardScaler.
# NOTE(review): the scaler is fitted on the combined train+test frame, so
# test rows contribute to the mean/std used for the training rows. Consider
# fitting on the training rows only if this feeds a model evaluation.
numeric_columns = ['Age', 'Fare']
scaler = StandardScaler()
scaler.fit(data[numeric_columns])
data[numeric_columns] = scaler.transform(data[numeric_columns])


In [25]:
# 5. Create New Features
# FamilySize = siblings/spouses + parents/children + the passenger themself
data['FamilySize'] = data['SibSp'].add(data['Parch']).add(1)
# IsAlone = 1 when the passenger has no family aboard, otherwise 0
data['IsAlone'] = data['FamilySize'].eq(1).astype(int)

In [26]:
# Confirm completion and show the first five engineered rows
print("Feature Engineering Completed.", data.head(), sep="\n")

Feature Engineering Completed.
   PassengerId  Survived  Pclass  Sex       Age  SibSp  Parch      Fare  \
0            1       0.0       3    1 -0.581628      1      0 -0.503291   
1            2       1.0       1    0  0.658652      1      0  0.734744   
2            3       1.0       3    0 -0.271558      0      0 -0.490240   
3            4       1.0       1    0  0.426099      1      0  0.383183   
4            5       0.0       3    1  0.426099      0      0 -0.487824   

   Embarked  FamilySize  IsAlone  
0         2           2        0  
1         0           2        0  
2         2           1        1  
3         2           2        0  
4         2           1        1  


In [27]:
# Split back to train and test
# The combined frame was built train-first, so the first len(train_data) rows
# are the training rows and the remainder are the test rows. Copies are taken
# so later edits do not touch `data`.
# Note: test_final keeps the 'Survived' column, which is NaN for those rows.
n_train_rows = len(train_data)
train_final = data.iloc[:n_train_rows].copy()
test_final = data.iloc[n_train_rows:].copy()

In [28]:
# Print (rows, columns) for each processed table
for caption, frame in (
    ("Final Train Shape:", train_final),
    ("Final Test Shape:", test_final),
):
    print(caption, frame.shape)

Final Train Shape: (891, 11)
Final Test Shape: (418, 11)
