In [1]:
import pandas as pd
import numpy as np

## Create a DataFrame from a List

In [3]:
# Each inner list is one student record: [student_id, age].
students = [[1, 15], [2, 11], [3, 11], [4, 20]]

In [15]:
# Build the frame from the nested list, naming the two positional fields.
df = pd.DataFrame(data=students, columns=["student_id", "age"])

In [17]:
# Display the frame built from the `students` list.
df

Unnamed: 0,student_id,age
0,1,15
1,2,11
2,3,11
3,4,20


## Get the Size of a DataFrame

In [25]:
# `shape` is a (rows, columns) tuple.
df.shape

(4, 2)

In [27]:
# Same information as `df.shape`, returned as a [rows, columns] list.
list(df.shape)

[4, 2]

## Display the First 3 Rows

In [30]:
# Preview only the first three rows.
df.head(n=3)

Unnamed: 0,student_id,age
0,1,15
1,2,11
2,3,11


## Select Data

In [33]:
# Column-oriented student records: one list per column.
data = dict(
    student_id=[101, 53, 128, 3],
    name=['Ulysses', 'William', 'Henry', 'Henry'],
    age=[13, 10, 6, 11],
)

df = pd.DataFrame(data=data)

In [49]:
# Option 1: boolean-mask the rows, then pick the columns with a second lookup.
df[df["student_id"].eq(101)][["name", "age"]]

Unnamed: 0,name,age
0,Ulysses,13


In [53]:
# Option 2: filter the rows first, then select the columns via `.loc`.
df.loc[df["student_id"] == 101].loc[:, ["name", "age"]]

Unnamed: 0,name,age
0,Ulysses,13


In [59]:
# Option 3: a single `.loc` call with the row mask and the column list.
df.loc[df["student_id"].eq(101), ["name", "age"]]

Unnamed: 0,name,age
0,Ulysses,13


## Create a New Column

In [64]:
# Employee names and their salaries, one list per column.
data = dict(
    name=['Piper', 'Grace', 'Georgia', 'Willow', 'Finn', 'Thomas'],
    salary=[4548, 28150, 1103, 6593, 74576, 24433],
)
employees_df = pd.DataFrame(data=data)
employees_df

Unnamed: 0,name,salary
0,Piper,4548
1,Grace,28150
2,Georgia,1103
3,Willow,6593
4,Finn,74576
5,Thomas,24433


In [66]:
# Add a `bonus` column equal to twice each salary.
employees_df["bonus"] = employees_df["salary"].mul(2)

In [68]:
# Display the frame, which now includes the `bonus` column.
employees_df

Unnamed: 0,name,salary,bonus
0,Piper,4548,9096
1,Grace,28150,56300
2,Georgia,1103,2206
3,Willow,6593,13186
4,Finn,74576,149152
5,Thomas,24433,48866


## Drop Duplicate Rows

In [71]:
# Customer records; note that two customers share 'john@example.com'.
data = dict(
    customer_id=[1, 2, 3, 4, 5, 6],
    name=['Ella', 'David', 'Zachary', 'Alice', 'Finn', 'Violet'],
    email=[
        'emily@example.com',
        'michael@example.com',
        'sarah@example.com',
        'john@example.com',
        'john@example.com',
        'alice@example.com',
    ],
)
customers_df = pd.DataFrame(data=data)
customers_df

Unnamed: 0,customer_id,name,email
0,1,Ella,emily@example.com
1,2,David,michael@example.com
2,3,Zachary,sarah@example.com
3,4,Alice,john@example.com
4,5,Finn,john@example.com
5,6,Violet,alice@example.com


In [73]:
# Preview: keep only the first row seen for each email (returns a new frame).
customers_df.drop_duplicates(subset=['email'], keep='first')

Unnamed: 0,customer_id,name,email
0,1,Ella,emily@example.com
1,2,David,michael@example.com
2,3,Zachary,sarah@example.com
3,4,Alice,john@example.com
5,6,Violet,alice@example.com


In [75]:
# Persist the de-duplication, keeping the first row for each email.
# The name is rebound to the returned frame instead of using `inplace=True`,
# which keeps the call chainable and makes the data lineage explicit.
customers_df = customers_df.drop_duplicates(subset='email', keep='first')

## Drop Missing Values

In [78]:
# Student records where one `name` entry is missing (None).
data = dict(
    student_id=[32, 217, 779, 849],
    name=['Piper', None, 'Georgia', 'Willow'],
    age=[5, 19, 20, 14],
)

# Build the frame and show it.
students_df = pd.DataFrame(data=data)
students_df

Unnamed: 0,student_id,name,age
0,32,Piper,5
1,217,,19
2,779,Georgia,20
3,849,Willow,14


In [80]:
# Drop rows whose `name` is missing; other columns are not checked.
students_df.dropna(subset=['name'])

Unnamed: 0,student_id,name,age
0,32,Piper,5
2,779,Georgia,20
3,849,Willow,14


## Modify Columns

In [83]:
# Display the employee frame before the salary column is modified.
employees_df

Unnamed: 0,name,salary,bonus
0,Piper,4548,9096
1,Grace,28150,56300
2,Georgia,1103,2206
3,Willow,6593,13186
4,Finn,74576,149152
5,Thomas,24433,48866


In [85]:
# Overwrite `salary` with twice its current value.
# NOTE(review): not idempotent; every re-run of this cell doubles it again.
employees_df['salary'] = employees_df['salary'].mul(2)

In [87]:
# Display the frame after the salary column was doubled.
employees_df

Unnamed: 0,name,salary,bonus
0,Piper,9096,9096
1,Grace,56300,56300
2,Georgia,2206,2206
3,Willow,13186,13186
4,Finn,149152,149152
5,Thomas,48866,48866


In [97]:
# Double the salary again using an augmented assignment.
# Bracket indexing is used for the write: attribute-style access
# (`employees_df.salary`) only works for existing columns whose names do not
# clash with DataFrame attributes, so it is fragile for assignments.
employees_df['salary'] *= 2
employees_df

Unnamed: 0,name,salary,bonus
0,Piper,18192.0,9096
1,Grace,112600.0,56300
2,Georgia,4412.0,2206
3,Willow,26372.0,13186
4,Finn,298304.0,149152
5,Thomas,97732.0,48866


## Rename Columns

In [100]:
# Student records with terse column names that will be renamed below.
data = dict(
    id=[1, 2, 3, 4, 5],
    first=['Mason', 'Ava', 'Taylor', 'Georgia', 'Thomas'],
    last=['King', 'Wright', 'Hall', 'Thompson', 'Moore'],
    age=[6, 7, 16, 18, 10],
)
df = pd.DataFrame(data=data)

In [102]:
# Display the frame with its original column names.
df

Unnamed: 0,id,first,last,age
0,1,Mason,King,6
1,2,Ava,Wright,7
2,3,Taylor,Hall,16
3,4,Georgia,Thompson,18
4,5,Thomas,Moore,10


In [104]:
# Mapping from each current column name to its replacement.
new_names = dict(
    id='student_id',
    first='first_name',
    last='last_name',
    age='age_in_years',
)

In [106]:
# Preview the rename; the result is not assigned, so `df` itself is unchanged.
df.rename(new_names, axis="columns")

Unnamed: 0,student_id,first_name,last_name,age_in_years
0,1,Mason,King,6
1,2,Ava,Wright,7
2,3,Taylor,Hall,16
3,4,Georgia,Thompson,18
4,5,Thomas,Moore,10


In [112]:
# Alternative: replace all column labels at once. The list is positional, so
# it must have one entry per existing column, in the same order.
df.columns = [
    'student_id',
    'first_name',
    'last_name',
    'age_in_years',
]
df

Unnamed: 0,student_id,first_name,last_name,age_in_years
0,1,Mason,King,6
1,2,Ava,Wright,7
2,3,Taylor,Hall,16
3,4,Georgia,Thompson,18
4,5,Thomas,Moore,10


## Change Data Types

In [115]:
# Convert a single column; returns a new float64 Series, `df` is unchanged.
df["age_in_years"].astype("float64")

0     6.0
1     7.0
2    16.0
3    18.0
4    10.0
Name: age_in_years, dtype: float64

In [117]:
# Convert via a {column: dtype} mapping; returns a new frame with only the
# listed column changed.
df.astype({"age_in_years": "float64"})

Unnamed: 0,student_id,first_name,last_name,age_in_years
0,1,Mason,King,6.0
1,2,Ava,Wright,7.0
2,3,Taylor,Hall,16.0
3,4,Georgia,Thompson,18.0
4,5,Thomas,Moore,10.0


- `astype()`: converts between many data types (e.g., `int`, `float`, `str`, `bool`).
- `pd.to_datetime()`: converts date/time data.
- `astype('category')`: converts columns with repeated values to a categorical data type for memory optimization.

## Fill Missing Data

In [123]:
# Same student records as before, with one missing `name` to be filled.
data = dict(
    student_id=[32, 217, 779, 849],
    name=['Piper', None, 'Georgia', 'Willow'],
    age=[5, 19, 20, 14],
)
students_df = pd.DataFrame(data=data)
students_df

Unnamed: 0,student_id,name,age
0,32,Piper,5
1,217,,19
2,779,Georgia,20
3,849,Willow,14


In [126]:
# Fill missing names in one column; returns a new Series.
students_df["name"].fillna(value="0")

0      Piper
1          0
2    Georgia
3     Willow
Name: name, dtype: object

In [130]:
# Fill per column using a {column: fill_value} mapping; returns a new frame.
students_df.fillna(value={"name": "01"})

Unnamed: 0,student_id,name,age
0,32,Piper,5
1,217,01,19
2,779,Georgia,20
3,849,Willow,14


## Reshape Data: Concatenate

In [133]:
# Two batches of student records that share the same columns.
data1 = dict(
    student_id=[1, 2, 3, 4],
    name=['Mason', 'Ava', 'Taylor', 'Georgia'],
    age=[8, 6, 15, 17],
)
data2 = dict(
    student_id=[5, 6],
    name=['Leo', 'Alex'],
    age=[7, 7],
)

df1, df2 = pd.DataFrame(data1), pd.DataFrame(data2)

In [137]:
# Display the second, smaller batch.
df2

Unnamed: 0,student_id,name,age
0,5,Leo,7
1,6,Alex,7


In [148]:
# Stack the two frames row-wise and renumber the index from 0.
pd.concat(objs=[df1, df2], axis="index", ignore_index=True)

Unnamed: 0,student_id,name,age
0,1,Mason,8
1,2,Ava,6
2,3,Taylor,15
3,4,Georgia,17
4,5,Leo,7
5,6,Alex,7


## Reshape Data: Pivot

In [151]:
# Long-format temperatures: five monthly readings for each of two cities.
data = dict(
    city=['Jacksonville'] * 5 + ['ElPaso'] * 5,
    month=['January', 'February', 'March', 'April', 'May'] * 2,
    temperature=[13, 23, 38, 5, 34, 20, 6, 26, 2, 43],
)
df = pd.DataFrame(data=data)
df

Unnamed: 0,city,month,temperature
0,Jacksonville,January,13
1,Jacksonville,February,23
2,Jacksonville,March,38
3,Jacksonville,April,5
4,Jacksonville,May,34
5,ElPaso,January,20
6,ElPaso,February,6
7,ElPaso,March,26
8,ElPaso,April,2
9,ElPaso,May,43


In [155]:
# One row per city, one column per month. Because `values` is given as a
# list, the result has two column levels ('temperature' above the months).
df.pivot(
    index=['city'],
    columns=['month'],
    values=['temperature'],
)

Unnamed: 0_level_0,temperature,temperature,temperature,temperature,temperature
month,April,February,January,March,May
city,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
ElPaso,2,6,20,26,43
Jacksonville,5,23,13,38,34


In [165]:
# Pivot so each month is a row and each city is a temperature column, then
# move the `month` index back into a regular column.
# `reset_index()` is chained rather than called with `inplace=True`, so the
# whole transformation is one expression that returns a new frame.
# NOTE(review): this rebinds `df1`, which held the first concat input earlier.
df1 = (
    df.pivot(index=['month'], columns=['city'], values=['temperature'])
    .reset_index()
)
df1

Unnamed: 0_level_0,month,temperature,temperature
city,Unnamed: 1_level_1,ElPaso,Jacksonville
0,April,2,5
1,February,6,23
2,January,20,13
3,March,26,38
4,May,43,34


## Reshape Data: Melt

In [168]:
# Wide-format sales: one row per product, one column per quarter.
data = dict(
    product=['Umbrella', 'SleepingBag'],
    quarter_1=[417, 800],
    quarter_2=[224, 936],
    quarter_3=[379, 93],
    quarter_4=[611, 875],
)
df = pd.DataFrame(data=data)
df

Unnamed: 0,product,quarter_1,quarter_2,quarter_3,quarter_4
0,Umbrella,417,224,379,611
1,SleepingBag,800,936,93,875


In [170]:
# Unpivot the quarter columns into (quarter, sales) pairs, one per row,
# keeping `product` as the identifier.
pd.melt(
    df,
    id_vars=['product'],
    var_name='quarter',
    value_name='sales',
)

Unnamed: 0,product,quarter,sales
0,Umbrella,quarter_1,417
1,SleepingBag,quarter_1,800
2,Umbrella,quarter_2,224
3,SleepingBag,quarter_2,936
4,Umbrella,quarter_3,379
5,SleepingBag,quarter_3,93
6,Umbrella,quarter_4,611
7,SleepingBag,quarter_4,875


## Method Chaining

In [173]:
# Animal records: name, species, age and weight, one list per column.
data = dict(
    name=['Tatiana', 'Khaled', 'Alex', 'Jonathan', 'Stefan', 'Tommy'],
    species=['Snake', 'Giraffe', 'Leopard', 'Monkey', 'Bear', 'Panda'],
    age=[98, 50, 6, 45, 100, 26],
    weight=[464, 41, 328, 463, 50, 349],
)
animals = pd.DataFrame(data=data)
animals

Unnamed: 0,name,species,age,weight
0,Tatiana,Snake,98,464
1,Khaled,Giraffe,50,41
2,Alex,Leopard,6,328
3,Jonathan,Monkey,45,463
4,Stefan,Bear,100,50
5,Tommy,Panda,26,349


In [179]:
# Names of the animals weighing more than 100, heaviest first.
(
    animals.loc[animals['weight'] > 100]
    .sort_values(by='weight', ascending=False)['name']
)

0     Tatiana
3    Jonathan
5       Tommy
2        Alex
Name: name, dtype: object