### **Pretty print**

In [None]:
import pandas as pd
from tabulate import tabulate  #used to create a table format of various kind of data like lists, pandas DataFrames...
df = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie','Riya'],
    'Age': [24, 30, 22,24],
    'City': ['New York', 'Los Angeles', 'Chicago','New York']
})
def pretty_print(title, df, n):
  print(f"\n{title}:")  #the title that should be displayed as heading of table
  print(tabulate(df.head(n), headers="keys", tablefmt="psql"))
  #headers="keys" specifies that column names of the DataFrame should be used as the headers in the formatted table
  #tablefmt="psql" the table is saved in PostgreSQL-style

pretty_print("Sample DataFrame", df, 5) #specify to display 5 rows


Sample DataFrame:
+----+---------+-------+-------------+
|    | Name    |   Age | City        |
|----+---------+-------+-------------|
|  0 | Alice   |    24 | New York    |
|  1 | Bob     |    30 | Los Angeles |
|  2 | Charlie |    22 | Chicago     |
|  3 | Riya    |    24 | New York    |
+----+---------+-------+-------------+


In [None]:
unique_age=df['Age'].unique()
print(unique_age)

[24 30 22]


In [None]:
df["Age"].value_counts()

Unnamed: 0_level_0,count
Age,Unnamed: 1_level_1
24,2
30,1
22,1


In [None]:
max_age = df["Age"].max()
print(max_age)

30


### **Unique values from multiple columns**

In [None]:

data = {'state':["CA","CA","TX","NC", "SC"],'deaths':[6,11,4,5,9]}
df = pd.DataFrame(data)  #creating data frame from dictionary
df2 = df.groupby('state')['deaths'].nunique().to_frame('deaths').reset_index()

#df.groupby('state')->groups dataframe by the state column which creates groups based on unique values in state
#nunique()->computes the number of unique values in deaths column for each state group and returns count of unique death values
#to_frame('deaths')->converts the resultant series into data frames
df2.head(10)

Unnamed: 0,state,deaths
0,CA,2
1,NC,1
2,SC,1
3,TX,1


### **Sorting the columns of a dataframe**

In [None]:
data = {
    'State': ['CA', 'CA', 'TX', 'NC', 'SC', 'SC'],
    'Sex': ['M', 'M', 'F', 'M', 'M', 'M']
}
df_in_power = pd.DataFrame(data)

df_state_male = df_in_power.loc[df_in_power['Sex'] == "M"].groupby(['State']).size().to_frame('count').reset_index()
#df_in_power['Sex'] == "M"->creates a boolean series where each value is True if sex='M'
#Uses this boolean Series to filter the DataFrame, resulting in a new DataFrame containing only rows where Sex is "M"
#counts the no.of male entries in each state.

df_sorted = df_state_male.sort_values(by=['count'], ascending=False)
print(df_sorted)

  State  count
0    CA      2
2    SC      2
1    NC      1


In [None]:
df.to_csv ('./out.csv', index = None, header=True)

In [10]:
import pandas as pd
import csv
df = pd.DataFrame({"date": ["2018-09-24"], "ret":[0.00013123989025119056]})
#creating a pandas data frame with two columns
df.to_csv("out.csv", sep=',', escapechar='\\', quoting=csv.QUOTE_ALL,index=None)
#out.csv->where we are writing our dataframe in this csv file
#sep=','->csv files are separated by comma
#escapechar='\\'->used to escape any special characters in the data
#quoting=csv.QUOTE_ALL->all fields in the csv file should be enclosed in double quotes
#index=None->this helps to omits the index column in csv file


df_from_csv = pd.read_csv("out.csv")

# Print the DataFrame
print(df_from_csv)

         date       ret
0  2018-09-24  0.000131


In [11]:
import csv
df.to_csv('out.csv',sep=' ', quoting=csv.QUOTE_NONE)
pd.read_csv('out.csv')

Unnamed: 0,date ret
0,0 2018-09-24 0.00013123989025119056


### **Find an object type**

In [12]:
num_person = 10
print(f"Type of num_person: {type(num_person)}")

Type of num_person: <class 'int'>


### **Single column group by**

In [25]:
import pandas as pd

data = {
    'age_range': [20, 23, 20, 35, 44,35],
    'department': ['HR', 'IT', 'HR', 'IT', 'Finance','Finance'],
    'class': ['A', 'B', 'A', 'A', 'B','A'],
    'salary': [50000, 60000, 55000, 62000, 60000,50000],
    'hours_worked': [40, 45, 35, 50, 40,50]
}

df = pd.DataFrame(data)

In [30]:
df['mean_age_range']=df.groupby("age_range")['salary'].transform('mean')
df['sum_department']=df.groupby("department")['hours_worked'].transform('sum')
#group by department that is getting unique department and after grouping it select hours_worked column for aggregation
#.sum() ->gets the sum of hours_worked of total department
#.transform('sum')->this helps us to get sum of hours_worked values for each group i.e each row
df['length_class']=df.groupby("class").transform('size')

print(df)

   age_range department class  salary  hours_worked  mean_age_range  \
0         20         HR     A   50000            40         52500.0   
1         23         IT     B   60000            45         60000.0   
2         20         HR     A   55000            35         52500.0   
3         35         IT     A   62000            50         56000.0   
4         44    Finance     B   60000            40         60000.0   
5         35    Finance     A   50000            50         56000.0   

   sum_department  length_class  
0              75             4  
1              95             2  
2              75             4  
3              95             4  
4              90             2  
5              90             4  


### **Multiple column group by**

In [33]:
import pandas as pd

df = pd.DataFrame({
    'col_1': [1, 1, 2, 2, 3],
    'col_2': [10, 10, 20, 20, 30],
    'col_3': [100, 200, 300, 400, 500],
    'col_4': [5, 15, 25, 35, 45]
})

print(df)

   col_1  col_2  col_3  col_4
0      1     10    100      5
1      1     10    200     15
2      2     20    300     25
3      2     20    400     35
4      3     30    500     45


In [35]:
df.groupby(["col_1", "col_2"]).mean()
df.groupby(["col_1", "col_2"]).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,col_3,col_4
col_1,col_2,Unnamed: 2_level_1,Unnamed: 3_level_1
1,10,300,20
2,20,700,60
3,30,500,45


In [36]:
df.groupby(["col_1", "col_2"])["col_3"].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,col_3
col_1,col_2,Unnamed: 2_level_1
1,10,150.0
2,20,350.0
3,30,500.0


### **Group by and using aggregation function**

In [38]:
import pandas as pd

df = pd.DataFrame({
    'state': ['CA', 'CA', 'NY', 'NY', 'TX', 'TX', 'CA', 'NY', 'TX'],
    'num_vacc': [100, 200, 150, 200, 300, 400, 180, 100, 150]
})

print(df)

  state  num_vacc
0    CA       100
1    CA       200
2    NY       150
3    NY       200
4    TX       300
5    TX       400
6    CA       180
7    NY       100
8    TX       150


In [40]:
df_new = df.groupby("state")["num_vacc"].mean().to_frame("count").reset_index()
#grouping the based on state and select the num_vacc column from the grouped object
#Calculates the mean (average) of the num_vacc values for each group defined by the unique state values.
#returning the resulting series to data frame by the column name count
print(df_new)

  state       count
0    CA  160.000000
1    NY  150.000000
2    TX  283.333333


### **Multiple aggregation function**

In [41]:
import pandas as pd

df = pd.DataFrame({
    'source': ['A', 'A', 'B', 'B', 'C', 'C', 'C'],
    'text': ['text1', 'text2', 'text3', 'text4', 'text5', 'text6', 'text7'],
    'sent': [0.1, 0.5, 0.3, 0.2, 0.4, 0.6, 0.7]
})

print(df)


  source   text  sent
0      A  text1   0.1
1      A  text2   0.5
2      B  text3   0.3
3      B  text4   0.2
4      C  text5   0.4
5      C  text6   0.6
6      C  text7   0.7


### **Aggregation using dictionary**

In [43]:
#here for text column we are getting size of the text
#for sent column we are calculating mean
#we are applying aggregation function for 2 columns so we don't need to convert
#into to_frame because the result will be in dataframe format
df_new = df.groupby('source').agg({'text': 'size', 'sent': 'mean'}).rename(columns={'text': 'count', 'sent': 'mean_sent'}).reset_index()
print(df_new)

  source  count  mean_sent
0      A      2   0.300000
1      B      2   0.250000
2      C      3   0.566667


**Other methods of aggregation**

In [44]:
df.groupby('source')['sent'].agg(count='size', mean_sent='mean').reset_index()


Unnamed: 0,source,count,mean_sent
0,A,2,0.3
1,B,2,0.25
2,C,3,0.566667


In [45]:
df[['source', 'sent']].groupby('source').agg(['count', 'mean']).reset_index()


Unnamed: 0_level_0,source,sent,sent
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean
0,A,2,0.3
1,B,2,0.25
2,C,3,0.566667


**Aggregating by dictionary**

In [49]:
df.groupby('source')[['source', 'sent']].agg({'sent': ['size','mean']}).reset_index()

Unnamed: 0_level_0,source,sent,sent
Unnamed: 0_level_1,Unnamed: 1_level_1,size,mean
0,A,2,0.3
1,B,2,0.25
2,C,3,0.566667


Groupby with Column Aggregation and Flattening

In [50]:
import pandas as pd

# Sample DataFrame
df = pd.DataFrame({
    'id1': [1, 1, 2, 2, 3, 3, 3],
    'id2': ['A', 'A', 'B', 'B', 'C', 'C', 'C'],
    'id3': ['X', 'Y', 'X', 'Y', 'X', 'Y', 'Z'],
    'col': [10, 20, 30, 40, 50, 60, 70]
})

In [51]:
df_agg = df.groupby(["id1", "id2", "id3"]).agg({k: ['min', 'max'] for k in ['col']})

In [53]:
df_agg.columns = df_agg.columns.map('_'.join) #flattens the columns
print(df_agg.columns)
df_agg = df_agg.reset_index()

Index(['i_d_1', 'i_d_2', 'i_d_3', 'c_o_l___m_i_n', 'c_o_l___m_a_x'], dtype='object')
