## Basic pandas with Series and DataFrame

In [32]:
# import necessary libraries
import pandas as pd

In [33]:
# Build the sample input: the integers 1 through 5 as a plain Python list
data = list(range(1, 6))

In [34]:
# print the list
data

[1, 2, 3, 4, 5]

In [35]:
# Check the type of the data
type(data)

list

In [36]:
# Wrap the list in a pandas Series; a default integer index (0, 1, 2, ...) is attached
s = pd.Series(data=data)
s

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [37]:
# Check the type of the Series
type(s)

pandas.core.series.Series

In [38]:
# Promote the (unnamed) Series to a single-column DataFrame labelled 'num'
df = s.to_frame(name='num')
df

Unnamed: 0,num
0,1
1,2
2,3
3,4
4,5


In [39]:
# Build a one-column DataFrame straight from the Python list, labelling the column 'num2'
df1 = pd.DataFrame({'num2': data})
df1

Unnamed: 0,num2
0,1
1,2
2,3
3,4
4,5


In [40]:
# Select the 'num2' column of the DataFrame; single-bracket selection returns a Series
s2 = df1['num2']
s2

0    1
1    2
2    3
3    4
4    5
Name: num2, dtype: int64

In [41]:
# Check the type of the column extracted from the DataFrame (it is a Series)
type(s2)

pandas.core.series.Series

In [42]:
# Build a nested list: three rows holding two consecutive integers each
data = [[first, first + 1] for first in (1, 3, 5)]
data

[[1, 2], [3, 4], [5, 6]]

In [43]:
# Turn the nested list into a DataFrame: each inner list becomes one row,
# and the two positions are labelled 'num1' and 'num2'
column_labels = ['num1', 'num2']
df = pd.DataFrame(data=data, columns=column_labels)
df

Unnamed: 0,num1,num2
0,1,2
1,3,4
2,5,6


In [44]:
# Selecting with a list of labels (double brackets) keeps the result a DataFrame
selected_columns = ['num1', 'num2']
s = df[selected_columns]
s

Unnamed: 0,num1,num2
0,1,2
1,3,4
2,5,6


In [45]:
type(s)

pandas.core.frame.DataFrame

## Dictionary to Series and DataFrame

In [46]:
# import necessary libraries
import pandas as pd

In [47]:
# Column-oriented sample data: each key is a column label and each list holds
# that column's values, one entry per person
data = dict(
    name=['Alice', 'Bob', 'Charlie'],
    age=[25, 30, 35],
    city=['New York', 'Los Angeles', 'Chicago'],
)
data

{'name': ['Alice', 'Bob', 'Charlie'],
 'age': [25, 30, 35],
 'city': ['New York', 'Los Angeles', 'Chicago']}

In [48]:
# create a Series for name from the dictionary
name_series = pd.Series(data['name'], name='name')
name_series

0      Alice
1        Bob
2    Charlie
Name: name, dtype: object

In [49]:
# create a DataFrame from the whole dictionary (one column per key: name, age, city)
df = pd.DataFrame(data)
df

Unnamed: 0,name,age,city
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago


In [50]:
# Create a Series for age
age_series = pd.Series(data['age'], name='age')
age_series

0    25
1    30
2    35
Name: age, dtype: int64

## Set to Series and DataFrame

In [51]:
# import necessary libraries
import pandas as pd

In [52]:
# Create a set from a list with repeated values; duplicates collapse to unique elements
s1 = set([1, 2, 3, 4, 3, 2, 1])
s1

{1, 2, 3, 4}

In [53]:
# Check the type of the set
type(s1)

set

In [54]:
# Convert the set to list and convert into a pandas Series
s = pd.Series(list(s1))
s

0    1
1    2
2    3
3    4
dtype: int64

In [55]:
# Create a DataFrame from the set.
# A set has no defined order, so sort it into a list first: this makes the row
# order deterministic instead of depending on the set's iteration order.
df = pd.DataFrame(sorted(s1), columns=['set_value'])
df

Unnamed: 0,set_value
0,1
1,2
2,3
3,4


## Tuple to Series and DataFrame 

In [56]:
# A tuple mixing integers, booleans and strings; the resulting Series has dtype object
t = (1, 2, 4, 5, 6, 7, 8) + (True, False) + ('hello', 'world')
s = pd.Series(data=t)
s

0         1
1         2
2         4
3         5
4         6
5         7
6         8
7      True
8     False
9     hello
10    world
dtype: object

In [57]:
# Build a one-column DataFrame from the tuple; the mixed values give an object column
df = pd.DataFrame(data=t, columns=['tuple_value'])
df

Unnamed: 0,tuple_value
0,1
1,2
2,4
3,5
4,6
5,7
6,8
7,True
8,False
9,hello
10,world


In [58]:
# Check the DataFrame information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 1 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   tuple_value  11 non-null     object
dtypes: object(1)
memory usage: 220.0+ bytes


## Numeric Data Type in Pandas 

In [59]:
import pandas as pd

In [60]:
# Creating a DataFrame with integer data.
# The raw columns (ages and salaries) live in `data`, the name used for raw
# input elsewhere in this notebook; the dict contains no dates.
data = {
    'age': [25, 30, 35],
    'salary': [50000, 60000, 70000],
}
df = pd.DataFrame(data)
df

Unnamed: 0,age,salary
0,25,50000
1,30,60000
2,35,70000


In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   age     3 non-null      int64
 1   salary  3 non-null      int64
dtypes: int64(2)
memory usage: 180.0 bytes


In [62]:
df.dtypes

age       int64
salary    int64
dtype: object

In [63]:
# Downcast both integer columns from int64 to int32 with a single astype call
df = df.astype({'age': 'int32', 'salary': 'int32'})
df

Unnamed: 0,age,salary
0,25,50000
1,30,60000
2,35,70000


In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   age     3 non-null      int32
 1   salary  3 non-null      int32
dtypes: int32(2)
memory usage: 156.0 bytes


In [65]:
# Number of distinct values that fit in 32 bits
2**32

4294967296

In [66]:
# Half of the 32-bit value count (2**31); `/` is true division, so the result is a float
2**32/2

2147483648.0

In [67]:
# Downcast further to int16. int16 only holds -32768..32767, so the salaries
# (50000-70000) do not fit: no error is raised and the values wrap around
# (e.g. 50000 becomes -15536). The ages are small enough to survive unchanged.
df = df.astype({'age': 'int16', 'salary': 'int16'})
df

Unnamed: 0,age,salary
0,25,-15536
1,30,-5536
2,35,4464


In [68]:
# 2**15 = 32768 is one past the largest int16 value (32767); larger numbers cannot be stored
2**15

32768

In [69]:
# Recreate the DataFrame from the original integer data so the columns hold
# the true ages and salaries again (as int64).
# The raw columns live in `data`; the dict contains no dates.
data = {
    'age': [25, 30, 35],
    'salary': [50000, 60000, 70000],
}
df = pd.DataFrame(data)
df

Unnamed: 0,age,salary
0,25,50000
1,30,60000
2,35,70000


In [70]:
# Convert both integer columns to float64 with a single astype call
df = df.astype({'age': 'float64', 'salary': 'float64'})
df

Unnamed: 0,age,salary
0,25.0,50000.0
1,30.0,60000.0
2,35.0,70000.0


In [71]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     3 non-null      float64
 1   salary  3 non-null      float64
dtypes: float64(2)
memory usage: 180.0 bytes


## Object, Datetime Datatype

In [72]:
import pandas as pd

In [73]:
# Sample records with text, integer and date-like string columns;
# 'join_date' is stored as plain strings here
data = dict(
    name=['Alice', 'Bob', 'Charlie'],
    age=[25, 30, 35],
    city=['New York', 'Los Angeles', 'Chicago'],
    join_date=['2020-01-01', '2021-02-15', '2019-03-20'],
)
data

{'name': ['Alice', 'Bob', 'Charlie'],
 'age': [25, 30, 35],
 'city': ['New York', 'Los Angeles', 'Chicago'],
 'join_date': ['2020-01-01', '2021-02-15', '2019-03-20']}

In [74]:
df = pd.DataFrame(data)
print(df.dtypes)

name         object
age           int64
city         object
join_date    object
dtype: object


In [75]:
df

Unnamed: 0,name,age,city,join_date
0,Alice,25,New York,2020-01-01
1,Bob,30,Los Angeles,2021-02-15
2,Charlie,35,Chicago,2019-03-20


In [76]:
# Date strings written as year-month-day
dates = ['2020-01-01', '2021-02-15', '2019-03-20']
# Put them in a single-column DataFrame; at this point they are still strings, not datetimes
df_dates = pd.DataFrame(dates, columns=['dates'])
df_dates

Unnamed: 0,dates
0,2020-01-01
1,2021-02-15
2,2019-03-20


In [77]:
# check the data types of the DataFrame
df_dates['dates'].dtype

dtype('O')

In [78]:
# convert the list of date strings to datetime values and build a new DataFrame from them
timestamps = pd.to_datetime(dates)
df2 = pd.DataFrame({'dates': timestamps})
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   dates   3 non-null      datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 156.0 bytes


In [79]:
df2

Unnamed: 0,dates
0,2020-01-01
1,2021-02-15
2,2019-03-20


In [99]:
import pandas as pd

# Date strings of mixed granularity: two year-only values and one full date
# whose day is not zero-padded
dates = ['2020', '2021-05-9', '2019']

# Convert the list to datetime; format='mixed' lets each element be parsed
# with its own format. Year-only entries come out as January 1st of that year.
timestamps = pd.to_datetime(dates, format='mixed')

# Create a new DataFrame with the datetime values
df2 = pd.DataFrame({'dates2': timestamps})

# Check the data type of the 'dates2' column
print(df2['dates2'].dtype)

datetime64[ns]


In [101]:
df2['dates2'].dtype

dtype('<M8[ns]')

In [102]:
df2

Unnamed: 0,dates2
0,2020-01-01
1,2021-05-09
2,2019-01-01


In [103]:
# Timestamp strings with ten fractional-second digits; the parsed values keep only
# nine of them (nanosecond precision), so the final digit of each is dropped.
# NOTE(review): the name `datetime` would shadow the standard-library module of
# the same name if that module were imported in this notebook.
datetime = [
    '2020-10-01 12:00:00.2521414484',
    '2021-05-09 14:30:00.1234567890',
    '2019-03-20 08:15:30.9876543210',
]
timestamps = pd.to_datetime(datetime)
# Single-column DataFrame holding the parsed timestamps
df = pd.DataFrame({'datetime': timestamps})

In [104]:
print(df['datetime'].dtype)

datetime64[ns]


In [105]:
df

Unnamed: 0,datetime
0,2020-10-01 12:00:00.252141448
1,2021-05-09 14:30:00.123456789
2,2019-03-20 08:15:30.987654321


In [106]:
# Parse the timestamp strings again and wrap the parsed values in a Series
timestamps = pd.to_datetime(datetime)
series = pd.Series(data=timestamps)
series

0   2020-10-01 12:00:00.252141448
1   2021-05-09 14:30:00.123456789
2   2019-03-20 08:15:30.987654321
dtype: datetime64[ns]

In [107]:
series.info()

<class 'pandas.core.series.Series'>
RangeIndex: 3 entries, 0 to 2
Series name: None
Non-Null Count  Dtype         
--------------  -----         
3 non-null      datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 156.0 bytes


### Pandas library
pandas is a Python library for data processing and analysis.

Pandas DataFrame:
A pandas DataFrame is a two-dimensional tabular data structure with labeled axes (rows and columns).