# Pandas DataFrame Tutorial

A basic guide to working with pandas DataFrames in Python.

## 1. Import Pandas Library

In [1]:
import numpy as np
import pandas as pd

## 2. Create a DataFrame

In [3]:
# Column-oriented sample data: each keyword becomes a column label and each
# list holds that column's values, one entry per person.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
    Age=[25, 30, 35, 28, 32],
    City=['New York', 'London', 'Paris', 'Tokyo', 'Sydney'],
    Salary=[50000, 60000, 70000, 55000, 65000],
)

# Build the DataFrame from the dict of columns.
df = pd.DataFrame(data)

# A bare last expression makes the notebook render the frame as a table.
df

Unnamed: 0,Name,Age,City,Salary
0,Alice,25,New York,50000
1,Bob,30,London,60000
2,Charlie,35,Paris,70000
3,Diana,28,Tokyo,55000
4,Eve,32,Sydney,65000


## 3. Read Data from CSV

In [4]:
# Round-trip the DataFrame through a CSV file in the working directory.
# index=False keeps the row labels out of the file.
df.to_csv(path_or_buf='sample_data.csv', index=False)

# Load the file into a separate DataFrame and print it.
df_from_csv = pd.read_csv(filepath_or_buffer='sample_data.csv')
print(df_from_csv)

      Name  Age      City  Salary
0    Alice   25  New York   50000
1      Bob   30    London   60000
2  Charlie   35     Paris   70000
3    Diana   28     Tokyo   55000
4      Eve   32    Sydney   65000


## 4. View DataFrame Content

In [5]:
# Peek at the first and last rows of the frame.
print("First 3 rows:")
print(df.head(3))

print("\nLast 2 rows:")
print(df.tail(2))

# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called on its own; wrapping it in print() would emit a stray "None".
print("\nDataFrame info:")
df.info()

# shape is a (rows, columns) tuple.
print("\nDataFrame shape:")
print(df.shape)

# tolist() turns the column Index into a plain Python list of labels.
print("\nColumn names:")
print(df.columns.tolist())

First 3 rows:
      Name  Age      City  Salary
0    Alice   25  New York   50000
1      Bob   30    London   60000
2  Charlie   35     Paris   70000

Last 2 rows:
    Name  Age    City  Salary
3  Diana   28   Tokyo   55000
4    Eve   32  Sydney   65000

DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    5 non-null      object
 1   Age     5 non-null      int64 
 2   City    5 non-null      object
 3   Salary  5 non-null      int64 
dtypes: int64(2), object(2)
memory usage: 292.0+ bytes
None

DataFrame shape:
(5, 4)

Column names:
['Name', 'Age', 'City', 'Salary']


## 5. Select Columns and Rows

In [6]:
# Single brackets with one label return that column as a Series.
print("Select single column:", df['Name'], sep="\n")

# A list of labels returns a DataFrame holding just those columns.
print("\nSelect multiple columns:", df[['Name', 'Age']], sep="\n")

# iloc is position-based and excludes the stop position: rows 0 and 1.
print("\nSelect rows by index using iloc:", df.iloc[:2], sep="\n")

# loc is label-based and includes the stop label: rows 0, 1 and 2.
print("\nSelect specific rows and columns using loc:", df.loc[:2, ['Name', 'Salary']], sep="\n")

# A boolean Series used as an indexer keeps only the rows where it is True.
print("\nSelect row by condition:", df[df['Age'].gt(30)], sep="\n")

Select single column:
0      Alice
1        Bob
2    Charlie
3      Diana
4        Eve
Name: Name, dtype: object

Select multiple columns:
      Name  Age
0    Alice   25
1      Bob   30
2  Charlie   35
3    Diana   28
4      Eve   32

Select rows by index using iloc:
    Name  Age      City  Salary
0  Alice   25  New York   50000
1    Bob   30    London   60000

Select specific rows and columns using loc:
      Name  Salary
0    Alice   50000
1      Bob   60000
2  Charlie   70000

Select row by condition:
      Name  Age    City  Salary
2  Charlie   35   Paris   70000
4      Eve   32  Sydney   65000


## 6. Filter Data

In [None]:
# Each filter builds a boolean mask and indexes df with it.
print("Filter by age > 30:", df[df['Age'].gt(30)], sep="\n")

# between() is inclusive on both ends by default: 55000 <= Salary <= 65000.
print("\nFilter by salary range:", df[df['Salary'].between(55000, 65000)], sep="\n")

print("\nFilter by city:", df[df['City'].eq('London')], sep="\n")

# isin() is True for rows whose City is any of the listed values.
print("\nFilter by multiple cities:", df[df['City'].isin(['New York', 'Paris'])], sep="\n")

# case=False makes the match case-insensitive, so an upper-case 'A' also counts.
print("\nFilter by name contains 'a':", df[df['Name'].str.contains('a', case=False)], sep="\n")

## 7. Add and Remove Columns

In [None]:
print("Original DataFrame:", df, sep="\n")

# Assigning to a new column label adds that column to df itself.
print("\nAdd new column:")
df['Department'] = ['IT', 'HR', 'Finance', 'Marketing', 'IT']
print(df)

# A column can be derived element-wise from an existing one.
print("\nAdd calculated column:")
df['Annual_Salary'] = df['Salary'].mul(12)
print(df)

# drop() returns a new DataFrame; df keeps all of its columns.
print("\nRemove column:")
df_new = df.drop(columns='Annual_Salary')
print(df_new)

# Passing a list drops several columns in one call.
print("\nRemove multiple columns:")
df_minimal = df.drop(columns=['Department', 'Annual_Salary'])
print(df_minimal)

## 8. Basic Statistics

In [None]:
# describe() summarises every numeric column in one table.
print("Describe all numeric columns:", df.describe(), sep="\n")

# Single-column aggregations each return one scalar.
print("\nMean age:", df['Age'].mean(), sep="\n")

print("\nSum of salaries:", df['Salary'].sum(), sep="\n")

print("\nMax salary:", df['Salary'].max(), sep="\n")

print("\nMin age:", df['Age'].min(), sep="\n")

# count() tallies the non-null entries of the column.
print("\nCount of records:", df['Name'].count(), sep="\n")

# value_counts() reports how many rows carry each distinct city.
print("\nValue counts for cities:", df['City'].value_counts(), sep="\n")

## 9. Group Data

In [None]:
# These groupings rely on the 'Department' column that section 7 adds to df.
print("Group by Department (mean salary):", df.groupby(by='Department')['Salary'].mean(), sep="\n")

# size() counts the rows that fall in each group.
print("\nGroup by Department (count):", df.groupby(by='Department').size(), sep="\n")

# agg() with a list of names yields one result column per statistic.
print(
    "\nGroup by Department (multiple statistics):",
    df.groupby(by='Department')['Salary'].agg(['mean', 'max', 'min']),
    sep="\n",
)

print("\nGroup by City (mean age):", df.groupby(by='City')['Age'].mean(), sep="\n")

## 10. Sort Data

In [None]:
# sort_values() returns a sorted copy; df itself keeps its row order.
print("Sort by Age (ascending):", df.sort_values(by='Age'), sep="\n")

print("\nSort by Salary (descending):", df.sort_values(by='Salary', ascending=False), sep="\n")

# Rows are ordered by Department first, then by Age within each department.
print("\nSort by multiple columns:", df.sort_values(by=['Department', 'Age']), sep="\n")

# sort_index() orders rows by their index labels.
print("\nSort by index:", df.sort_index(), sep="\n")

## 11. Handle Missing Data

In [None]:
# Work on a copy whose numeric columns are float64. NaN cannot be stored in an
# int64 column, and assigning it directly relies on a silent int -> float
# upcast that newer pandas releases deprecate. astype() returns a new
# DataFrame, so df itself is not modified.
df_missing = df.astype({'Age': 'float64', 'Salary': 'float64'})
df_missing.loc[1, 'Age'] = np.nan
df_missing.loc[3, 'Salary'] = np.nan

print("DataFrame with missing values:")
print(df_missing)

# isnull() flags each missing cell; sum() then counts them per column.
print("\nCheck for missing values:")
print(df_missing.isnull().sum())

# dropna() returns a copy without any row that contains a missing value.
print("\nDrop rows with missing values:")
print(df_missing.dropna())

# Fill each column with its own statistic; mean() and median() skip NaN
# by default, so they are computed from the remaining values only.
print("\nFill missing values:")
df_filled = df_missing.fillna({'Age': df_missing['Age'].mean(), 'Salary': df_missing['Salary'].median()})
print(df_filled)