# Introduction to Pandas
Pandas is a high-level data manipulation package which was built on top of Numpy. The key structures within pandas include series and Dataframes

## Series

A Series is a one-dimensional array with axis labels (an index).

In [2]:
# Importing libraries and packages
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [4]:
# Creating a Series from a list
x = pd.Series([10,20,30,40,50])
x

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [5]:
# We can access different components separately:

# Accessing the index
x.index

RangeIndex(start=0, stop=5, step=1)

In [6]:
# Accessing values
x.values

array([10, 20, 30, 40, 50])

In [7]:
# Accessing the datatype
# A Series is an ndarray, thus it's homogeneous and CANNOT store multiple datatypes
x.dtype

dtype('int64')

In [8]:
# Creating a series with an Index
data = [450, 650, 870]
Sales = Series(data, index=["Don", "Mike", "Edwin"])
Sales

Don      450
Mike     650
Edwin    870
dtype: int64

In [10]:
# Check the type
type(Sales)

pandas.core.series.Series

In [11]:
# If we check the index of Sales, we will get the values, rather than the range, because it's a string.
Sales.index

Index(['Don', 'Mike', 'Edwin'], dtype='object')

### Accessing Values

In [12]:
# You can access values using the index name
Sales["Don"]

np.int64(450)

In [13]:
# You can still use traditional indexing
Sales[0]

  Sales[0]


np.int64(450)

### Checking for conditions

In [15]:
# You can filter based on conditions.
Sales>500
# This will usually return booleans.

Don      False
Mike      True
Edwin     True
dtype: bool

In [16]:
# We can use these booleans
Sales[[False, True, True]]

Mike     650
Edwin    870
dtype: int64

In [17]:
# If we want to see values greater than 500, we can use those booleans
Sales[Sales>500]

Mike     650
Edwin    870
dtype: int64

In [18]:
# Checking the names in the index
"Don" in Sales

True

In [19]:
# False example
"Sally" in Sales

False

In [22]:
# What about this?
450 in Sales
# 450 is not an index, it's a value. Thus, it will return False.

False

### Working with Dictionaries

In [24]:
# Converting a Series to a dictionary
sales_dict = Sales.to_dict()
sales_dict

{'Don': 450, 'Mike': 650, 'Edwin': 870}

In [25]:
# Converting a dict to a series
sales_ser = Series(sales_dict)
sales_ser

Don      450
Mike     650
Edwin    870
dtype: int64

### Creating series from an existing one, and NaN values

In [27]:
# We can create a new Series from an existing Series
# If we specify names in the index that were Not there already, NaN values will be assigned
new_sales = Series(Sales, index=["Don", "Mike", "Sally", "Edwin", "lucy"])
new_sales

Don      450.0
Mike     650.0
Sally      NaN
Edwin    870.0
lucy       NaN
dtype: float64

In [31]:
# We can check if there are any NaN values in a Series.
# For this we use Numpy!
np.isnan(new_sales)

Don      False
Mike     False
Sally     True
Edwin    False
lucy      True
dtype: bool

In [32]:
# To check for null values, use Pandas!
pd.isnull(new_sales)

Don      False
Mike     False
Sally     True
Edwin    False
lucy      True
dtype: bool

### Naming components in a Series

In [35]:
# Name an index
Sales.index.name = "Sales person"
Sales

Sales person
Don      450
Mike     650
Edwin    870
dtype: int64

In [37]:
# Naming a Series
Sales.name = "Total tv sales"
Sales

Sales person
Don      450
Mike     650
Edwin    870
Name: Total tv sales, dtype: int64

## DataFrames

DataFrames are two-dimensional, size-mutable (meaning that it can change), potentially heterogeneous (can have multiple data types) tabular data structures. This data structure contains TWO labeled axes (rows and the columns).

### Creating a DataFrame

In [39]:
# Creating a DataFrame from a list
data = [["adrian", 20], ["bethany", 23], ["bob", 33]]

# When we create a DataFrame, we can specify what the column names are and the data type is
df = pd.DataFrame(data, columns=["Name", "Age"])
df

Unnamed: 0,Name,Age
0,adrian,20
1,bethany,23
2,bob,33


### Creating a DataFrame from a dictionary

In [41]:
# Dictionary with lists as values
dictlist = {
    "Name": ["Alice", "Bob", "Charlie"],
    "Age": [25, 30, 35],
    "City": ["London", "Paris", "Berlin"]
}

# Create a DataFrame
dfdict = pd.DataFrame(dictlist)

# Display the DataFrame
print(dfdict)

      Name  Age    City
0    Alice   25  London
1      Bob   30   Paris
2  Charlie   35  Berlin


In [43]:
# Adding custom indexes
dfdict = pd.DataFrame(dictlist, index=["Row1", "Row2", "Row3"])
dfdict

Unnamed: 0,Name,Age,City
Row1,Alice,25,London
Row2,Bob,30,Paris
Row3,Charlie,35,Berlin


In [45]:
# Creating a DataFrame from a list of dictionaries is the same principle
dictlist = [
    {"Name": "Alice", "Age": 25, "City": "London"},
    {"Name": "Bob", "Age": 30, "City": "Paris"},
    {"Name": "Charlie", "Age": 35, "City": "Berlin"}
]
dfdict = pd.DataFrame(dictlist)
dfdict
# Notice here no index was defined, hence automatic numbering and tabling.

Unnamed: 0,Name,Age,City
0,Alice,25,London
1,Bob,30,Paris
2,Charlie,35,Berlin


In [46]:
# Creating a DataFrame form a Series
# Create a Series
s = pd.Series([10, 20, 30, 40], name="Values")

# Convert the Series into a DataFrame
df = pd.DataFrame(s)

print(df)

   Values
0      10
1      20
2      30
3      40


In [50]:
# Adding Series to an existing DataFrame, this differs for column or row
# As a new column:
new_series = pd.Series(["London", "Paris", "Berlin", "Beirut"], name="City")

# Add the Series as a new column
df["City"] = new_series

print(df)

   Values    City
0      10  London
1      20   Paris
2      30  Berlin
3      40  Beirut


In [53]:
# As a new row:
# Existing DataFrame
df = pd.DataFrame({
    "Name": ["Alice", "Bob", "Charlie"],
    "Age": [25, 30, 35],
    "City": ["London", "Paris", "Berlin"]
})

# New data as a Series
new_row = pd.Series({"Name": "David", "Age": 40, "City": "Madrid"})

# Add the Series as a new row using pd.concat()
df = pd.concat([df, new_row.to_frame().T], ignore_index=True)

# Display the updated DataFrame
print(df)

      Name Age    City
0    Alice  25  London
1      Bob  30   Paris
2  Charlie  35  Berlin
3    David  40  Madrid


### Shifting/Changing a DataFrame's Index

In [57]:
# Make your DataFrame
df = pd.DataFrame({
    "Name": ["Alice", "Bob", "Charlie"],
    "Age": [25, 30, 35],
    "City": ["London", "Paris", "Berlin"]
})

# Set the "Name" column as the index through the following method
df = df.set_index("Name")
df

Unnamed: 0_level_0,Age,City
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Alice,25,London
Bob,30,Paris
Charlie,35,Berlin


### Filling missing values

You can use fillna() to fill missing values in your DataFrame or Series. By default, it allows you to:
* Fill with a specific value.
* Use methods like backfill or pad to propagate existing values.

In [59]:
# Example DataFrame with missing values
df = pd.DataFrame({
    "A": [1, 2, np.nan, 4],
    "B": [np.nan, 2, 3, 4]
})

Unnamed: 0,A,B
0,1.0,
1,2.0,2.0
2,,3.0
3,4.0,4.0


In [60]:
# Filling it with a chosen value:

df_filled = df.fillna(0)

print(df_filled)

     A    B
0  1.0  0.0
1  2.0  2.0
2  0.0  3.0
3  4.0  4.0


If you don’t want to fill every column with the same value, you can pass a dictionary to fillna() specifying a value for each column.

In [61]:
# Fill missing values with different values for each column
df_filled = df.fillna({"A": 99, "B": 0})

print(df_filled)

      A    B
0   1.0  0.0
1   2.0  2.0
2  99.0  3.0
3   4.0  4.0


backfill or bfill: These methods replace missing values with the next valid value in the column.

In [62]:
df_bfilled = df.fillna(method="bfill")

print(df_bfilled)

     A    B
0  1.0  2.0
1  2.0  2.0
2  4.0  3.0
3  4.0  4.0


  df_bfilled = df.fillna(method="bfill")


pad or ffill: These methods replace missing values with the previous valid value in the column.

In [63]:
# Forward fill missing values
df_ffilled = df.fillna(method="ffill")

print(df_ffilled)

     A    B
0  1.0  NaN
1  2.0  2.0
2  2.0  3.0
3  4.0  4.0


  df_ffilled = df.fillna(method="ffill")


As you can see, the forward fill didnt work for value B0 as there was no previous value for it to be filled with

You can use the limit parameter to restrict how many missing values are filled in sequence.

In [64]:
# Forward fill but only fill one missing value per sequence
df_limited = df.fillna(method="ffill", limit=1)

print(df_limited)

     A    B
0  1.0  NaN
1  2.0  2.0
2  2.0  3.0
3  4.0  4.0


  df_limited = df.fillna(method="ffill", limit=1)


### summary of filling methods

In [66]:
# Create a dictionary to represent the table
data = {
    "Method": ["fillna(value)", "bfill", "backfill", "ffill", "pad", "limit"],
    "Description": [
        "Fill missing values with a specific value (e.g., 0, mean, etc.).",
        "Fill with the next valid value (backward fill).",
        "Same as bfill.",
        "Fill with the previous valid value (forward fill).",
        "Same as ffill.",
        "Restrict the number of consecutive missing values to fill in a row or column."
    ]
}

# Convert the dictionary into a pandas DataFrame
methods_df = pd.DataFrame(data)

# Display the DataFrame
print(methods_df)

          Method                                        Description
0  fillna(value)  Fill missing values with a specific value (e.g...
1          bfill    Fill with the next valid value (backward fill).
2       backfill                                     Same as bfill.
3          ffill  Fill with the previous valid value (forward fi...
4            pad                                     Same as ffill.
5          limit  Restrict the number of consecutive missing val...


### What does .interpolate() do?

In Python 3, particularly in IPython kernel environments like Jupyter Notebook, .interpolate() is a method commonly used with pandas DataFrames and Series to fill in missing values by estimating them based on existing data.

basic syntax: df.interpolate(method='linear', axis=0, inplace=False, limit=None)

In [68]:
# Create a DataFrame with missing values
data = {'A': [1, np.nan, 3, np.nan, 5]}
df = pd.DataFrame(data)

# Interpolate missing values
df_interpolated = df.interpolate()

print(df_interpolated)

     A
0  1.0
1  2.0
2  3.0
3  4.0
4  5.0


You can specify different interpolation methods such as quadratic, polynomial, nearest, etc.
* Research these key different methods and try them out

Key Parameters of .interpolate()
* method='linear' (default) → Fills values in a straight-line fashion.
* method='polynomial' → Fits a polynomial curve.
* limit → Limits the number of consecutive NaNs to fill.
* inplace=True → Modifies the DataFrame directly.

## Dropping values

In [73]:
#Drop a column
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})

# Drop column 'B'
df = df.drop(columns=['B'])

print(df)

   A  C
0  1  7
1  2  8
2  3  9


In [74]:
# Drop a row
df = df.drop(index=[0])  # Drop row with index 0
print(df)

   A  C
1  2  8
2  3  9


In [75]:
# Dropping based on a threshold
# The thresh parameter drops rows/columns if they don’t have enough non-null values.
df = pd.DataFrame({'A': [1, np.nan, 3], 'B': [4, np.nan, np.nan], 'C': [7, 8, np.nan]})

# Drop rows that have **less than 2 non-null values**
df = df.dropna(thresh=2)

print(df)

     A    B    C
0  1.0  4.0  7.0


In [77]:
# You can drop rows by their index values
df = pd.DataFrame({'A': [1, np.nan, 3], 'B': [4, np.nan, np.nan], 'C': [7, 8, np.nan]})
df = df.drop(index=[2])  # Drop row with index 2
print(df)

     A    B    C
0  1.0  4.0  7.0
1  NaN  NaN  8.0


## Finding and Removing Duplicate Rows in Pandas

In [78]:
# Use .duplicated() to check for duplicate rows.
# Create a DataFrame with duplicate rows
df = pd.DataFrame({
    'A': [1, 2, 2, 4, 4],
    'B': ['X', 'Y', 'Y', 'Z', 'Z']
})

# Find duplicate rows
duplicates = df[df.duplicated()]

print(duplicates)
# By default, .duplicated() marks all but the first occurrence as duplicates.

   A  B
2  2  Y
4  4  Z


In [79]:
# Use .drop_duplicates() to remove duplicates.
df_unique = df.drop_duplicates()

print(df_unique)

   A  B
0  1  X
1  2  Y
3  4  Z


In [80]:
# If you want to check duplicates based on specific columns:
df[df.duplicated(subset=['A'], keep=False)]

Unnamed: 0,A,B
1,2,Y
2,2,Y
3,4,Z
4,4,Z


For the above: 
* keep='first' → Keeps the first occurrence and marks the rest as duplicates.
* keep='last' → Keeps the last occurrence.
* keep=False → Marks all duplicates.

In [82]:
# Remove duplicates based on a specific column
df_unique = df.drop_duplicates(subset=['A'])
df_unique

Unnamed: 0,A,B
0,1,X
1,2,Y
3,4,Z
