In [1]:
### Task 1:
### Import libraries. Load and explore the "titanic.csv" seaborn dataset available at the bottom of this page.
### You can also use sns.load_dataset("titanic") to load the dataset.
import seaborn as sns
import pandas as pd

# Load the Titanic dataset
titanic = sns.load_dataset("titanic")

# Display the first few rows
print(titanic.head())

# Describe the dataset
print(titanic.describe())

   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  
         survived      pclass         age       sibsp       parch        fare
count  891.000000  891.000000  714.000000  891.000000  891.000000  891.000000
mean     0.383838    2.308642   29.699118    0.523008    0.3815

In [2]:
#### 1.2 - Groupby and Aggregation Functions: Group by 'sex' and 'class', then calculate the mean and standard deviation of 'age' and 'fare'.
# Group by 'sex' and 'class', then calculate the mean and standard deviation of 'age' and 'fare'
grouped = titanic.groupby(['sex', 'class'], observed=True).agg({
    'age': ['mean', 'std'],
    'fare': ['mean', 'std']
})
print(grouped)

                     age                   fare           
                    mean        std        mean        std
sex    class                                              
female First   34.611765  13.612052  106.125798  74.259988
       Second  28.722973  12.872702   21.970121  10.891796
       Third   21.750000  12.729964   16.118810  11.690314
male   First   41.281386  15.139570   67.226127  77.548021
       Second  30.740707  14.793894   19.741782  14.922235
       Third   26.507589  12.159514   12.661633  11.681696


In [3]:
### 1.3 - Applying Functions: Group by 'sex' and 'class', then apply a custom function to calculate the range of 'age'.
# Custom function to calculate the range
def calc_range(x):
    return x.max() - x.min()

# Group by 'sex' and 'class', then apply the custom function to 'age'
grouped = titanic.groupby(['sex', 'class'], observed=True)['age'].apply(calc_range)

print(grouped)

sex     class 
female  First     61.00
        Second    55.00
        Third     62.25
male    First     79.08
        Second    69.33
        Third     73.58
Name: age, dtype: float64


In [4]:
### 1.4 - Group Transforms: Group by 'sex' and 'class', then calculate the z-score of 'fare' within each group.
from scipy.stats import zscore

# Calculate the z-score of 'fare' within each group
titanic['fare_zscore'] = titanic.groupby(['sex', 'class'], observed=True)['fare'].transform(zscore)

print(titanic.head())

   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  fare_zscore  
0    man        True  NaN  Southampton    no  False    -0.463926  
1  woman       False    C    Cherbourg   yes  False    -0.471712  
2  woman       False  NaN  Southampton   yes   True    -0.703352  
3  woman       False    C  Southampton   yes  False    -0.717885  
4    man        True  NaN  Southampton    no   True    -0.395344  


In [5]:
### 1.5 - Iterating Over Groups and Selecting Columns: Iterate over the groups of 'sex' and calculate the mean age for each group.
# Iterate over the groups of 'sex' and calculate the mean age for each group
for sex, group in titanic.groupby('sex'):
    print(f"Sex: {sex}")
    print(group['age'].mean())

Sex: female
27.915708812260537
Sex: male
30.72664459161148


In [6]:
#### 1.6 - Selecting Columns: Group by 'sex' and 'class', then select the 'fare' column and calculate its mean.
# Group by 'sex' and 'class', then select the 'fare' column and calculate its mean
grouped = titanic.groupby(['sex', 'class'], observed=True)['fare'].mean()

print(grouped)

sex     class 
female  First     106.125798
        Second     21.970121
        Third      16.118810
male    First      67.226127
        Second     19.741782
        Third      12.661633
Name: fare, dtype: float64


In [7]:
### 1.7 - Advanced Grouping and Aggregation: Grouping with Dictionary or Series. Group by a dictionary mapping 'class' to 'Deck' levels and calculate the mean fare.
# Dictionary mapping 'class' to 'Deck' levels
deck_mapping = {
    'First': 'A',
    'Second': 'B',
    'Third': 'C'
}

# Group by the dictionary mapping and calculate the mean fare
map_grouping = titanic.groupby(titanic['class'].map(deck_mapping), observed=True)['fare'].mean()

print(map_grouping)

class
A    84.154687
B    20.662183
C    13.675550
Name: fare, dtype: float64


In [8]:
### 1.8 - Grouping with Functions: Group by whether the passenger is a minor (age < 18) and calculate the mean fare.
# Function to determine if a passenger is a minor
def is_minor(age):
    return age < 18

# Group by the function and calculate the mean fare
age_grouped = titanic.groupby(titanic['age'].apply(is_minor))['fare'].mean()

print(age_grouped)

age
False    32.347043
True     31.220798
Name: fare, dtype: float64


In [9]:
### 1.9 - Grouping with Index Levels and Column-Wise: Group by 'sex' and 'class' index levels and calculate the mean age and fare.
# Set multi-level index
titanic.set_index(['sex', 'class'], inplace=True)

# Group by index levels and calculate the mean age and fare
grouped = titanic.groupby(level=['sex', 'class'], observed=True).agg({'age': 'mean', 'fare': 'mean'})

print(grouped)

                     age        fare
sex    class                        
female First   34.611765  106.125798
       Second  28.722973   21.970121
       Third   21.750000   16.118810
male   First   41.281386   67.226127
       Second  30.740707   19.741782
       Third   26.507589   12.661633


In [11]:
### 1.10 - Grouping with Multiple Functions: Group by 'sex' and 'class', then calculate the mean, median, and count of 'age'.
# Group by 'sex' and 'class', then calculate the mean, median, and count of 'age'
sc_grouped = titanic.groupby(['sex', 'class'], observed=True)['age'].agg(['mean', 'median', 'count'])

print(sc_grouped)

                    mean  median  count
sex    class                           
female First   34.611765    35.0     85
       Second  28.722973    28.0     74
       Third   21.750000    21.5    102
male   First   41.281386    40.0    101
       Second  30.740707    30.0     99
       Third   26.507589    25.0    253


In [10]:
### 1.11 - Pivot tables and Crosstabulation: Create a pivot table to calculate the mean fare for each combination of 'sex' and 'class'.
# Create a pivot table to calculate the mean fare for each combination of 'sex' and 'class'
pivot_table = titanic.pivot_table(values='fare', index='sex', columns='class', aggfunc='mean')

print(pivot_table)

class        First     Second      Third
sex                                     
female  106.125798  21.970121  16.118810
male     67.226127  19.741782  12.661633


  pivot_table = titanic.pivot_table(values='fare', index='sex', columns='class', aggfunc='mean')


In [12]:
### 1.12 - Crosstab: Create a crosstab to count the number of passengers for each combination of 'sex' and 'class'.
# Create a crosstab to count the number of passengers for each combination of 'who' and 'sex'
crosstab = pd.crosstab(titanic['who'], titanic['fare'].mean())

print(crosstab)

col_0  32.204208
who             
child         83
man          537
woman        271


In [13]:
### Task 2:
### 2.1 - Import libraries. Create a Students Performance Dataset.
import pandas as pd

# Create the students performance dataset
data_performance = {
    'student_id': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Frank', 'Grace', 'Henry', 'Ivy', 'John'],
    'math_score': [85, 92, 78, 88, 91, 76, 89, 94, 87, 95],
    'english_score': [88, 79, 85, 90, 78, 83, 86, 92, 89, 91]
}

df_performance = pd.DataFrame(data_performance)
print("Students Performance Dataset:")
print(df_performance)

Students Performance Dataset:
   student_id     name  math_score  english_score
0           1    Alice          85             88
1           2      Bob          92             79
2           3  Charlie          78             85
3           4    David          88             90
4           5      Eva          91             78
5           6    Frank          76             83
6           7    Grace          89             86
7           8    Henry          94             92
8           9      Ivy          87             89
9          10     John          95             91


In [14]:
### 2.2 - Create a Students Attendance Dataset.
# Create the students attendance dataset
data_attendance = {
    'student_id': [1, 2, 3, 4, 5, 6, 11, 12, 13, 14],
    'attendance_days': [180, 175, 190, 185, 178, 172, 160, 165, 170, 175]
}

df_attendance = pd.DataFrame(data_attendance)
print("\nStudents Attendance Dataset:")
print(df_attendance)


Students Attendance Dataset:
   student_id  attendance_days
0           1              180
1           2              175
2           3              190
3           4              185
4           5              178
5           6              172
6          11              160
7          12              165
8          13              170
9          14              175


In [15]:
### 2.3 - Merging Datasets: Merge the Performance and Attendance Datasets on student_id.
# Merge the performance and attendance datasets on 'student_id'
df_merged = pd.merge(df_performance, df_attendance, on='student_id', how='inner')
print("\nMerged Dataset (Inner Join):")
print(df_merged)


Merged Dataset (Inner Join):
   student_id     name  math_score  english_score  attendance_days
0           1    Alice          85             88              180
1           2      Bob          92             79              175
2           3  Charlie          78             85              190
3           4    David          88             90              185
4           5      Eva          91             78              178
5           6    Frank          76             83              172


In [16]:
### 2.4 - Perform a Left Merge.
# Perform a left merge to keep all students in the performance dataset
df_left_merge = pd.merge(df_performance, df_attendance, on='student_id', how='left')
print("\nLeft Merge (Keep all students in performance dataset):")
print(df_left_merge)


Left Merge (Keep all students in performance dataset):
   student_id     name  math_score  english_score  attendance_days
0           1    Alice          85             88            180.0
1           2      Bob          92             79            175.0
2           3  Charlie          78             85            190.0
3           4    David          88             90            185.0
4           5      Eva          91             78            178.0
5           6    Frank          76             83            172.0
6           7    Grace          89             86              NaN
7           8    Henry          94             92              NaN
8           9      Ivy          87             89              NaN
9          10     John          95             91              NaN


In [17]:
### 2.5 - Concatenating Datasets: Concatenate the Performance Dataset with Itself
# Concatenate the performance dataset with itself
df_concat = pd.concat([df_performance, df_performance], ignore_index=True)
print("\nConcatenated Dataset (Performance with itself):")
print(df_concat)


Concatenated Dataset (Performance with itself):
    student_id     name  math_score  english_score
0            1    Alice          85             88
1            2      Bob          92             79
2            3  Charlie          78             85
3            4    David          88             90
4            5      Eva          91             78
5            6    Frank          76             83
6            7    Grace          89             86
7            8    Henry          94             92
8            9      Ivy          87             89
9           10     John          95             91
10           1    Alice          85             88
11           2      Bob          92             79
12           3  Charlie          78             85
13           4    David          88             90
14           5      Eva          91             78
15           6    Frank          76             83
16           7    Grace          89             86
17           8    Henry          

In [18]:
### 2.6 - Concatenate the Performance and Attendance Datasets Along Columns.
# Concatenate the performance and attendance datasets along columns
df_concat_columns = pd.concat([df_performance, df_attendance], axis=1)
print("\nConcatenated Dataset Along Columns:")
print(df_concat_columns)


Concatenated Dataset Along Columns:
   student_id     name  math_score  english_score  student_id  attendance_days
0           1    Alice          85             88           1              180
1           2      Bob          92             79           2              175
2           3  Charlie          78             85           3              190
3           4    David          88             90           4              185
4           5      Eva          91             78           5              178
5           6    Frank          76             83           6              172
6           7    Grace          89             86          11              160
7           8    Henry          94             92          12              165
8           9      Ivy          87             89          13              170
9          10     John          95             91          14              175


In [19]:
### 2.7 - Combining Datasets with combine_first: Combine Datasets to Fill Missing Values.
# Creating another attendance dataset with some overlapping and missing 'attendance_days'
data_attendance_new = {
    'student_id': [1, 2, 3, 7, 8, 9, 10, 15, 16, 17],
    'attendance_days': [182, 176, 191, 175, 178, 173, 165, 169, 174, 168]
}

df_attendance_new = pd.DataFrame(data_attendance_new)
print("\nNew Students Attendance Dataset:")
print(df_attendance_new)

# Combine the attendance datasets to fill missing values
df_combined = df_attendance.combine_first(df_attendance_new)
print("\nCombined Attendance Dataset (Using combine_first):")
print(df_combined)


New Students Attendance Dataset:
   student_id  attendance_days
0           1              182
1           2              176
2           3              191
3           7              175
4           8              178
5           9              173
6          10              165
7          15              169
8          16              174
9          17              168

Combined Attendance Dataset (Using combine_first):
   student_id  attendance_days
0           1              180
1           2              175
2           3              190
3           4              185
4           5              178
5           6              172
6          11              160
7          12              165
8          13              170
9          14              175


In [20]:
### - Task 3:
### 3.1 - Import libraries
import numpy as np
import pandas as pd

### 3.2 - Use NumPy to create a large random dataset with 1,000,000 rows and 10 columns, and load it into a Pandas DataFrame.
# Set the random seed for reproducibility
np.random.seed(42)

# Create a large random dataset with 1,000,000 rows and 10 columns
data = np.random.rand(1000000, 10)

# Create column names
columns = [f'col_{i}' for i in range(10)]

# Load the dataset into a Pandas DataFrame
df_large = pd.DataFrame(data, columns=columns)

print("Large Random Dataset:")
print(df_large.head())
print(f"\nDataset shape: {df_large.shape}")

Large Random Dataset:
      col_0     col_1     col_2     col_3     col_4     col_5     col_6  \
0  0.374540  0.950714  0.731994  0.598658  0.156019  0.155995  0.058084   
1  0.020584  0.969910  0.832443  0.212339  0.181825  0.183405  0.304242   
2  0.611853  0.139494  0.292145  0.366362  0.456070  0.785176  0.199674   
3  0.607545  0.170524  0.065052  0.948886  0.965632  0.808397  0.304614   
4  0.122038  0.495177  0.034389  0.909320  0.258780  0.662522  0.311711   

      col_7     col_8     col_9  
0  0.866176  0.601115  0.708073  
1  0.524756  0.431945  0.291229  
2  0.514234  0.592415  0.046450  
3  0.097672  0.684233  0.440152  
4  0.520068  0.546710  0.184854  

Dataset shape: (1000000, 10)


In [21]:
### 3.3 - Using to_pickle and read_pickle: Save the DataFrame using the to_pickle function.
# Save the DataFrame to a pickle file
pickle_filename = 'large_dataset.pkl'
df_large.to_pickle(pickle_filename)

print(f"\nDataFrame saved to pickle file: {pickle_filename}")


DataFrame saved to pickle file: large_dataset.pkl


In [22]:
### 3.4 - Read the DataFrame back from the pickle file using the read_pickle function.
# Read the DataFrame from the pickle file
df_loaded = pd.read_pickle(pickle_filename)

print("\nLoaded DataFrame from Pickle File:")
print(df_loaded.head())
print(f"\nLoaded DataFrame shape: {df_loaded.shape}")


Loaded DataFrame from Pickle File:
      col_0     col_1     col_2     col_3     col_4     col_5     col_6  \
0  0.374540  0.950714  0.731994  0.598658  0.156019  0.155995  0.058084   
1  0.020584  0.969910  0.832443  0.212339  0.181825  0.183405  0.304242   
2  0.611853  0.139494  0.292145  0.366362  0.456070  0.785176  0.199674   
3  0.607545  0.170524  0.065052  0.948886  0.965632  0.808397  0.304614   
4  0.122038  0.495177  0.034389  0.909320  0.258780  0.662522  0.311711   

      col_7     col_8     col_9  
0  0.866176  0.601115  0.708073  
1  0.524756  0.431945  0.291229  
2  0.514234  0.592415  0.046450  
3  0.097672  0.684233  0.440152  
4  0.520068  0.546710  0.184854  

Loaded DataFrame shape: (1000000, 10)


In [24]:
### 3.5 - Verifying Data Integrity: Verify that the loaded DataFrame is identical to the original DataFrame.
# Verify that the loaded DataFrame is identical to the original DataFrame
is_identical = df_large.equals(df_loaded)
print(f"\nIs the loaded DataFrame identical to the original? {is_identical}")


Is the loaded DataFrame identical to the original? True
