# Module: Pandas Exercise
### 1: DataFrame Creation and Indexing

1. Create a Pandas DataFrame with 4 columns and 6 rows filled with random integers. Set the index to be the first column.

In [1]:
import pandas as pd
import numpy as np

In [2]:
data =pd.DataFrame(np.random.randint(1,16, size=(6, 4)), columns=['A', 'B', 'C', 'D'])
print("Original DataFrame\n",data)

#Set the index to be the first column.
data.set_index('A', inplace=True)
print("DataFrame with new index\n",data)

Original DataFrame
     A   B   C   D
0   2   2   6   5
1   7   9   9   5
2   5   8   2  10
3  13  10   8   9
4  15   6  15   4
5   7   5   9   9
DataFrame with new index
      B   C   D
A             
2    2   6   5
7    9   9   5
5    8   2  10
13  10   8   9
15   6  15   4
7    5   9   9


2. Create a Pandas DataFrame with columns 'A', 'B', 'C' and index 'X', 'Y', 'Z'. Fill the DataFrame with random integers and access the element at row 'Y' and column 'B'.

In [5]:
array =pd.DataFrame(np.random.randint(1,16, size =(3,3)),columns=['A', 'B', 'C'],index=['X', 'Y', 'Z'])
print("Original DataFrame\n",array)

#access the element at row 'Y' and column 'B'
element = array.at['Y', 'B']
print("\nElement at row 'Y' and column 'B'\n",element)

Original DataFrame
     A   B   C
X  12   8  14
Y   6  12   9
Z  14  13   4

Element at row 'Y' and column 'B'
 12


### 2: DataFrame Operations

1. Create a Pandas DataFrame with 3 columns and 5 rows filled with random integers. Add a new column that is the product of the first two columns.

In [6]:
array =pd.DataFrame(np.random.randint(1,16, size =(5,3)),columns=['A', 'B', 'C'])
print("Original DataFrame\n",array)

#Add a new column that is the product of the first two columns

array['D']=array['A']*array['B']
print("\nDataFrame with new column\n",array)


Original DataFrame
    A   B   C
0  1   7   5
1  7   7   5
2  8   6   9
3  6  15  13
4  1   2  15

DataFrame with new column
    A   B   C   D
0  1   7   5   7
1  7   7   5  49
2  8   6   9  48
3  6  15  13  90
4  1   2  15   2


2. Create a Pandas DataFrame with 3 columns and 4 rows filled with random integers. Compute the row-wise and column-wise sum.

In [7]:
array =pd.DataFrame(np.random.randint(1,16, size =(4,3)),columns=['A', 'B', 'C'])
print("Original DataFrame\n",array)

#Compute the row-wise and column-wise sum

row_sum = array.sum(axis=1)
print("\nsum of rows\n",row_sum)

column_sum = array.sum(axis=0)
print("\nsum of columns\n",column_sum)

Original DataFrame
     A   B   C
0   8  12   3
1  10   5  11
2   7   8  15
3   6  10  12

sum of rows
 0    23
1    26
2    30
3    28
dtype: int64

sum of columns
 A    31
B    35
C    41
dtype: int64


### 3: Data Cleaning

1. Create a Pandas DataFrame with 3 columns and 5 rows filled with random integers. Introduce some NaN values. Fill the NaN values with the mean of the respective columns.

In [8]:
array =pd.DataFrame(np.random.randint(1,16, size =(5,3)),columns=['A', 'B', 'C'])
print("Original DataFrame\n",array)

#Introduce some NaN values

array.iloc[0, 0] = np.nan
array.iloc[1, 2] = np.nan
array.iloc[2, 0] = np.nan
print("\nDataFrame with NaN\n",array)

#Fill the NaN values with the mean of the respective columns.

array = array.fillna(array.mean())
print("\nDataFrame with NaN filled with mean\n",array)

Original DataFrame
     A  B   C
0   5  6   4
1   5  4  15
2  12  3  15
3   5  7  12
4   9  2  14

DataFrame with NaN
      A  B     C
0  NaN  6   4.0
1  5.0  4   NaN
2  NaN  3  15.0
3  5.0  7  12.0
4  9.0  2  14.0

DataFrame with NaN filled with mean
           A  B      C
0  6.333333  6   4.00
1  5.000000  4  11.25
2  6.333333  3  15.00
3  5.000000  7  12.00
4  9.000000  2  14.00


2. Create a Pandas DataFrame with 4 columns and 6 rows filled with random integers. Introduce some NaN values. Drop the rows with any NaN values.

In [2]:
df = pd.DataFrame(np.random.randint(1,16, size =(6,4)),columns=['A', 'B', 'C','D'])

#print the original dataframe
print("Original DataFrame\n",df)

df.iloc[0,2] = np.nan
df.iloc[1,3] = np.nan
print("\nDataFrame with NaN\n",df)

#drop the rows with NaN
df.dropna(inplace=True)
print("\nDataFrame without NaN\n",df)


Original DataFrame
     A   B   C   D
0  13  10  13  14
1   1  12  12   3
2  11   6  12   6
3   9  10   5   4
4   7   7   8  10
5   6   2   3   2

DataFrame with NaN
     A   B     C     D
0  13  10   NaN  14.0
1   1  12  12.0   NaN
2  11   6  12.0   6.0
3   9  10   5.0   4.0
4   7   7   8.0  10.0
5   6   2   3.0   2.0

DataFrame without NaN
     A   B     C     D
2  11   6  12.0   6.0
3   9  10   5.0   4.0
4   7   7   8.0  10.0
5   6   2   3.0   2.0


### 4: Data Aggregation

1. Create a Pandas DataFrame with 2 columns: 'Category' and 'Value'. Fill the 'Category' column with random categories ('A', 'B', 'C') and the 'Value' column with random integers. Group the DataFrame by 'Category' and compute the sum and mean of 'Value' for each category.

In [3]:
df = pd.DataFrame({'Category':np.random.choice(['A', 'B', 'C'], size=10), 'Value':np.random.randint(1, 100, size=10)})

#print the original dataframe
print("Original DataFrame\n",df)

#Group the DataFrame by 'Category' and compute the sum and mean of 'Value' for each category
grouped_df = df.groupby('Category')['Value'].agg(['sum', 'mean'])
print("\nGrouped DataFrame\n",grouped_df)

Original DataFrame
   Category  Value
0        B     18
1        A      9
2        A     16
3        A     44
4        B     87
5        B     49
6        C     37
7        A     48
8        B     61
9        B     71

Grouped DataFrame
           sum   mean
Category            
A         117  29.25
B         286  57.20
C          37  37.00


2. Create a Pandas DataFrame with 3 columns: 'Product', 'Category', and 'Sales'. Fill the DataFrame with random data. Group the DataFrame by 'Category' and compute the total sales for each category.

In [7]:
df = pd.DataFrame({'Product':np.random.choice(['Apple', 'Banana','Orange', 'Peach'],size=10),
                   'Category':np.random.choice(['A', 'B', 'C'],size=10),
                   'Sales':np.random.randint(50, 200, size=10)
                   })

#print the original dataframe
print("Original DataFrame\n",df)

#Group the DataFrame by 'Category' and compute the total sales for each category.
Category_total_sales = df.groupby('Category')['Sales'].sum()
print("\nCategory total sales\n",Category_total_sales)

Original DataFrame
   Product Category  Sales
0  Orange        C    126
1  Banana        B    110
2  Orange        A    127
3  Banana        A    149
4  Orange        B    120
5   Apple        C    120
6  Orange        C    131
7  Orange        C     85
8  Orange        A    187
9   Apple        A     53

Category total sales
 Category
A    516
B    230
C    462
Name: Sales, dtype: int32


### 5: Merging DataFrames

1. Create two Pandas DataFrames with a common column. Merge the DataFrames using the common column.

In [11]:
df1 =pd.DataFrame({'Category':np.random.choice(['A', 'B', 'C'], size=5), 'Value':np.random.randint(1, 100, size=5)})
df2 =pd.DataFrame({'Category':np.random.choice(['A', 'B', 'D'], size=5), 'Value1':np.random.randint(1, 100, size=5)})

#print the original dataframe
print("Original DataFrame\n",df1)
print("\nOriginal DataFrame\n",df2)

#Merge the two DataFrames on the 'Category' column
merged_df = pd.merge(df1,df2, on='Category', how='outer')
print("\nMerged DataFrame\n",merged_df)

Original DataFrame
   Category  Value
0        A     49
1        A     67
2        A     40
3        B     82
4        C     95

Original DataFrame
   Category  Value1
0        A      73
1        B      98
2        D       4
3        B      37
4        D       6

Merged DataFrame
   Category  Value  Value1
0        A   49.0    73.0
1        A   67.0    73.0
2        A   40.0    73.0
3        B   82.0    98.0
4        B   82.0    37.0
5        C   95.0     NaN
6        D    NaN     4.0
7        D    NaN     6.0


2. Create two Pandas DataFrames with different columns. Concatenate the DataFrames along the rows and along the columns.

In [14]:
# Create two Pandas DataFrames with different columns
df1 = pd.DataFrame({'A': np.random.randint(1, 100, size=3), 'B': np.random.randint(1, 100, size=3)})
df2 = pd.DataFrame({'C': np.random.randint(1, 100, size=3), 'D': np.random.randint(1, 100, size=3)})

#print the original dataframe
print("Original DataFrame\n",df1)
print("\nOriginal DataFrame\n",df2)

#Concatenate the DataFrames along the rows and along the columns.
concatenated_df = pd.concat([df1,df2], axis=1)
print("\nConcatenated DataFrame\n",concatenated_df)

Original DataFrame
     A   B
0  50  60
1  73  95
2  56  49

Original DataFrame
     C   D
0  31  65
1  24  62
2  75  88

Concatenated DataFrame
     A   B   C   D
0  50  60  31  65
1  73  95  24  62
2  56  49  75  88


### 6: Time Series Analysis

1. Create a Pandas DataFrame with a datetime index and one column filled with random integers. Resample the DataFrame to compute the monthly mean of the values.

In [15]:
date_range =pd.date_range(start='2022-01-01', end='2022-06-10', freq='D')
df =pd.DataFrame(date_range, columns=['Date'])
df['Data'] = np.random.randint(1, 100, size=len(date_range))
df.set_index('Date', inplace=True)

#print the original dataframe
print("Original DataFrame\n",df)

#Resample the DataFrame to compute the monthly mean of the values.
monthly_mean = df.resample('M').mean()
print("\nMonthly mean\n",monthly_mean)

Original DataFrame
             Data
Date            
2022-01-01    53
2022-01-02     1
2022-01-03    47
2022-01-04    80
2022-01-05    70
...          ...
2022-06-06    65
2022-06-07    24
2022-06-08    62
2022-06-09    22
2022-06-10    74

[161 rows x 1 columns]

Monthly mean
                  Data
Date                 
2022-01-31  47.516129
2022-02-28  49.785714
2022-03-31  50.451613
2022-04-30  45.800000
2022-05-31  47.064516
2022-06-30  46.600000


  monthly_mean = df.resample('M').mean()


2. Create a Pandas DataFrame with a datetime index ranging from '2021-01-01' to '2021-12-31' and one column filled with random integers. Compute the rolling mean with a window of 7 days.

In [18]:
date_range =pd.date_range(start='2021-01-01', end='2021-12-31', freq='D')
df =pd.DataFrame(date_range, columns=['Date'])
df['Data'] = np.random.randint(1, 100, size=len(date_range))
df.set_index('Date', inplace=True)

#print the original dataframe
print("Original DataFrame\n",df)

#Compute the rolling mean with a window of 7 days.
rolling_mean = df.rolling(window=7).mean()
print("\nRolling mean\n",rolling_mean)

Original DataFrame
             Data
Date            
2021-01-01    52
2021-01-02    75
2021-01-03    65
2021-01-04    61
2021-01-05    64
...          ...
2021-12-27    76
2021-12-28    79
2021-12-29    72
2021-12-30    44
2021-12-31    90

[365 rows x 1 columns]

Rolling mean
                  Data
Date                 
2021-01-01        NaN
2021-01-02        NaN
2021-01-03        NaN
2021-01-04        NaN
2021-01-05        NaN
...               ...
2021-12-27  48.857143
2021-12-28  48.285714
2021-12-29  54.857143
2021-12-30  52.571429
2021-12-31  64.571429

[365 rows x 1 columns]


### 7: MultiIndex DataFrame

1. Create a Pandas DataFrame with a MultiIndex (hierarchical index). Perform some basic indexing and slicing operations on the MultiIndex DataFrame.

In [22]:
arrays =[['A','A','B','B'],['one','two','one','two']]
index = pd.MultiIndex.from_arrays(arrays, names =('Category','SubCategory'))
df = pd.DataFrame(np.random.randint(1, 100, size=(4, 3)), index=index, columns=['Value1', 'Value2', 'Value3'])

#print the original dataframe
print("Original DataFrame\n",df)

#Perform some basic indexing and slicing operations on the MultiIndex DataFrame
print("\n Indexing at category A\n",df.loc['A'])

print("\nSlicing at Category 'B' and SubCategory 'two'\n",df.loc[('B', 'two')])


Original DataFrame
                       Value1  Value2  Value3
Category SubCategory                        
A        one              45      84      84
         two              17      45      54
B        one              42      27       1
         two               6      92      99

 Indexing at category A
              Value1  Value2  Value3
SubCategory                        
one              45      84      84
two              17      45      54

Slicing at Category 'B' and SubCategory 'two'
 Value1     6
Value2    92
Value3    99
Name: (B, two), dtype: int32


2. Create a Pandas DataFrame with MultiIndex consisting of 'Category' and 'SubCategory'. Fill the DataFrame with random data and compute the sum of values for each 'Category' and 'SubCategory'.

In [9]:
array = [['Apple','Apple','Orange','Orange','Apple','Apple','Orange','Orange'],['A','B','A','B','A','B','A','B']]
index = pd.MultiIndex.from_arrays(array, names =('Category','SubCategory'))
df = pd.DataFrame(np.random.randint(1, 100, size=(8, 3)), index=index, columns=['Value1', 'Value2', 'Value3'])

#print the original dataframe
print("Original DataFrame\n",df)

#compute the sum of values for each 'Category' and 'SubCategory'.

sum_value =df.groupby(['Category', 'SubCategory'])[['Value1', 'Value2', 'Value3']].sum()
print("\nSum of values\n",sum_value)

Original DataFrame
                       Value1  Value2  Value3
Category SubCategory                        
Apple    A                24      10      82
         B                 3      12      28
Orange   A                 3      10      31
         B                19      24       3
Apple    A                10      11       3
         B                54      25      33
Orange   A                34      32      67
         B                96      18      46

Sum of values
                       Value1  Value2  Value3
Category SubCategory                        
Apple    A                34      21      85
         B                57      37      61
Orange   A                37      42      98
         B               115      42      49


### 8: Pivot Tables

1. Create a Pandas DataFrame with columns 'Date', 'Category', and 'Value'. Create a pivot table to compute the sum of 'Value' for each 'Category' by 'Date'.

In [12]:
date_rng = pd.date_range(start='2021-01-01', end='2021-01-31', freq='D')
df = pd.DataFrame({'Date': np.random.choice(date_rng,size =20), 'Category': np.random.choice(['A', 'B', 'C'], size=20), 'Value': np.random.randint(1, 100, size=20)})

#print the original dataframe
#print("Original DataFrame\n",df)

#Create a pivot table to compute the sum of 'Value' for each 'Category' by 'Date'.
pivot_table = df.pivot_table(values='Value', index='Date', columns='Category', aggfunc='sum')
print("\nPivot table\n",pivot_table)



Pivot table
 Category       A     B      C
Date                         
2021-01-05  52.0   NaN   13.0
2021-01-06   NaN   NaN   42.0
2021-01-09   NaN  24.0    NaN
2021-01-10  96.0   NaN    NaN
2021-01-11   NaN  13.0    NaN
2021-01-12  77.0   NaN    NaN
2021-01-14   NaN  44.0    NaN
2021-01-16   9.0   NaN    NaN
2021-01-17   NaN  26.0    NaN
2021-01-18  51.0  76.0    NaN
2021-01-19   NaN  52.0    NaN
2021-01-20   NaN  28.0    NaN
2021-01-21   NaN  93.0    NaN
2021-01-24   NaN   NaN  108.0
2021-01-25  94.0   NaN    NaN
2021-01-27   NaN   NaN   65.0
2021-01-31  85.0   NaN    NaN


2. Create a Pandas DataFrame with columns 'Year', 'Quarter', and 'Revenue'. Create a pivot table to compute the mean 'Revenue' for each 'Quarter' by 'Year'.

In [3]:
df = pd.DataFrame({'Year': np.random.choice(['2020', '2021', '2022'], size=20), 'Quarter': np.random.choice(['Q1', 'Q2', 'Q3', 'Q4'], size=20), 'Revenue': np.random.randint(1, 100, size=20)})

#print the original dataframe
print("Original DataFrame\n",df)

#Create a pivot table to compute the mean 'Revenue' for each 'Quarter' by 'Year'

pivot_table= df.pivot_table(values='Revenue', index='Year', columns='Quarter', aggfunc='mean')

print("\nPivot table\n",pivot_table)

Original DataFrame
     Year Quarter  Revenue
0   2020      Q4       71
1   2022      Q1       18
2   2021      Q1       63
3   2020      Q2       97
4   2021      Q2        9
5   2020      Q3       23
6   2020      Q3        4
7   2021      Q3       18
8   2020      Q4       62
9   2022      Q3       18
10  2021      Q4       97
11  2022      Q4       67
12  2021      Q2       97
13  2021      Q1       72
14  2020      Q1       18
15  2022      Q3       60
16  2022      Q2       19
17  2022      Q2       51
18  2022      Q4       10
19  2021      Q4       32

Pivot table
 Quarter    Q1    Q2    Q3    Q4
Year                           
2020     18.0  97.0  13.5  66.5
2021     67.5  53.0  18.0  64.5
2022     18.0  35.0  39.0  38.5


###  9: Applying Functions

1. Create a Pandas DataFrame with 3 columns and 5 rows filled with random integers. Apply a function that doubles the values of the DataFrame.

In [10]:
df = pd.DataFrame(np.random.randint(1,100, size=(5,3)),columns=['A', 'B', 'C'])

#print the original dataframe
print("Original DataFrame\n",df)

#Apply a function that doubles the values of the DataFrame.

df = df.map(lambda x: x*2)
print("\nDataFrame with function applied\n",df)

Original DataFrame
     A   B   C
0   9  18   6
1  95  65  73
2  98  88  48
3  67  87  27
4  31   9  36

DataFrame with function applied
      A    B    C
0   18   36   12
1  190  130  146
2  196  176   96
3  134  174   54
4   62   18   72


2. Create a Pandas DataFrame with 3 columns and 6 rows filled with random integers. Apply a lambda function to create a new column that is the sum of the existing columns.

In [13]:
df = pd.DataFrame(np.random.randint(1,100, size=(6,3)),columns=['A', 'B', 'C'])

#print the original dataframe
print("Original DataFrame\n",df)

#Apply a lambda function to create a new column that is the sum of the existing columns.

df['D'] = df.apply(lambda x: x.sum(), axis=1)
print("\nDataFrame with new column\n",df)

Original DataFrame
     A   B   C
0  49  54  22
1  15  63  41
2   2  46   6
3  66  75  17
4  75  42  99
5  81  26  86

DataFrame with new column
     A   B   C    D
0  49  54  22  125
1  15  63  41  119
2   2  46   6   54
3  66  75  17  158
4  75  42  99  216
5  81  26  86  193


### 10: Working with Text Data

1. Create a Pandas Series with 5 random text strings. Convert all the strings to uppercase.

In [15]:
text_data = pd.Series(['apple', 'banana', 'carrot', 'dates', 'eggplant'])

#print the original series
print("Original Series\n",text_data)

#Convert all the strings to uppercase.
text_data = text_data.str.upper()
print("\nUppercase Series\n",text_data)

Original Series
 0       apple
1      banana
2      carrot
3       dates
4    eggplant
dtype: object

Uppercase Series
 0       APPLE
1      BANANA
2      CARROT
3       DATES
4    EGGPLANT
dtype: object


2. Create a Pandas Series with 5 random text strings. Extract the first three characters of each string.

In [16]:
text_data = pd.Series(['apple', 'banana', 'carrot', 'dates', 'eggplant'])

#print the original series
print("Original Series\n",text_data)

#Extract the first three characters of each string.

text_data = text_data.str[:3]
print("\nFirst 3 characters of each string\n",text_data)

Original Series
 0       apple
1      banana
2      carrot
3       dates
4    eggplant
dtype: object

First 3 characters of each string
 0    app
1    ban
2    car
3    dat
4    egg
dtype: object
