# BABAR NASEER
**Email:** kh.babar.naseer@outlook.com
#### **Assignment:** Exploring Pandas

#### Import Libraries:

In [32]:
import pandas as pd
import numpy as np


### Basic Data Structures

In [33]:
series = pd.Series([1, 3.4, 51, np.nan, 6, 89])


In [34]:
date_range = pd.date_range(start="2024-01-01", periods=6)


In [35]:
df_random = pd.DataFrame(np.random.randn(6, 4), index=date_range, columns=['A', 'B', 'C', 'D'])


In [36]:
df_mixed = pd.DataFrame({
    'A': 1.0,
    'B': pd.Timestamp("2013-01-02"),
    'C': pd.Series(1, index=list(range(4)), dtype="float32"),
    'D': np.array([3] * 4, dtype="int32"),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': "foo"
})


In [37]:
print(df_mixed.dtypes)


A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object


### Viewing Data

In [38]:
print(df_random.head())


                   A         B         C         D
2024-01-01 -0.781556  0.180533  0.376384  0.956896
2024-01-02  1.064062 -0.845425 -0.026342 -0.185578
2024-01-03  2.448547 -0.408963 -0.696748  0.808225
2024-01-04 -0.763298  0.039024 -1.537197  1.394768
2024-01-05  0.903209  0.621771 -0.593962 -0.538809


In [39]:
print(df_random.tail(3))


                   A         B         C         D
2024-01-04 -0.763298  0.039024 -1.537197  1.394768
2024-01-05  0.903209  0.621771 -0.593962 -0.538809
2024-01-06 -0.842682  0.889949  0.526960 -1.821900


In [40]:
print(df_mixed.index)
print(df_random.columns)


Index([0, 1, 2, 3], dtype='int64')
Index(['A', 'B', 'C', 'D'], dtype='object')


In [41]:
numpy_array_random = df_random.to_numpy()
numpy_array_mixed = df_mixed.to_numpy()


In [42]:
print(df_random.describe())


              A         B         C         D
count  6.000000  6.000000  6.000000  6.000000
mean   0.338047  0.079482 -0.325151  0.102267
std    1.353611  0.641376  0.772908  1.191266
min   -0.842682 -0.845425 -1.537197 -1.821900
25%   -0.776992 -0.296966 -0.671051 -0.450501
50%    0.069956  0.109779 -0.310152  0.311324
75%    1.023849  0.511461  0.275703  0.919728
max    2.448547  0.889949  0.526960  1.394768


In [43]:
df_transposed = df_random.T
print(df_transposed)


   2024-01-01  2024-01-02  2024-01-03  2024-01-04  2024-01-05  2024-01-06
A   -0.781556    1.064062    2.448547   -0.763298    0.903209   -0.842682
B    0.180533   -0.845425   -0.408963    0.039024    0.621771    0.889949
C    0.376384   -0.026342   -0.696748   -1.537197   -0.593962    0.526960
D    0.956896   -0.185578    0.808225    1.394768   -0.538809   -1.821900


In [44]:
sorted_df_by_index = df_random.sort_index(axis=1, ascending=False)
sorted_df_by_values = df_random.sort_values(by='B')


### Data Selection

In [45]:
column_A = df_random['A']
first_three_rows = df_random.iloc[:3]
rows_from_date = df_random['2024-01-02':'2024-01-04']


In [46]:
data_specific_date = df_random.loc['2024-01-02']
rows_for_AB = df_random.loc[:, ['A', 'B']]
rows_for_AB_by_date = df_random.loc['2024-01-02':'2024-01-04', ['A', 'B']]


In [47]:
fourth_row = df_random.iloc[3]
rows_4_to_5_columns_0_to_2 = df_random.iloc[3:5, 0:2]
rows_positions = df_random.iloc[[1, 2, 4], [0, 2]]
rows_2_to_3_all_columns = df_random.iloc[2:4, :]
all_rows_columns_2_to_3 = df_random.iloc[:, 2:4]


### Data Manipulation

In [48]:
df_copy = df_random.copy()
df_copy['E'] = ["one", "one", "two", "three", "four", "three"]


In [49]:
reindexed_df = df_random.reindex(date_range[:4])
reindexed_df['E'] = [1, 1, np.nan, np.nan]


dropped_na_df = reindexed_df.dropna()

# Fill missing data
filled_na_df = reindexed_df.fillna(5)


print(reindexed_df.isna())


                A      B      C      D      E
2024-01-01  False  False  False  False  False
2024-01-02  False  False  False  False  False
2024-01-03  False  False  False  False   True
2024-01-04  False  False  False  False   True


In [50]:
mean_df = df_random.mean()
mean_along_axis_1 = df_random.mean(axis=1)
shifted_series = pd.Series([1, 3, 5, np.nan, 6, 8], index=df_random.index).shift(2)
subtracted_df = df_random.sub(shifted_series, axis=0)


In [51]:
df_mean_multiplied = df_random.apply(lambda x: x.mean() * 5.6)
df_multiplied = df_random.applymap(lambda x: x * 101.2)


  df_multiplied = df_random.applymap(lambda x: x * 101.2)


### Miscellaneous

In [52]:
random_integers = pd.Series(np.random.randint(0, 7, size=10))
print(random_integers.value_counts())


2    5
4    3
3    2
Name: count, dtype: int64


In [53]:
string_series = pd.Series(["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"])
lowercase_series = string_series.str.lower()


### Merging and Grouping

In [54]:
d1 = pd.DataFrame({'key': ['A', 'B', 'C'], 'value': [1, 2, 3]})
d2 = pd.DataFrame({'key': ['A', 'B', 'D'], 'value': [4, 5, 6]})
merged_df = pd.merge(d1, d2, on='key')

d1_multi = pd.DataFrame({'key1': ['A', 'B', 'C'], 'key2': ['K', 'L', 'M'], 'value': [1, 2, 3]})
d2_multi = pd.DataFrame({'key1': ['A', 'B', 'D'], 'key2': ['K', 'L', 'N'], 'value': [4, 5, 6]})
merged_multi_df = pd.merge(d1_multi, d2_multi, on=['key1', 'key2'])


In [55]:
df_group = pd.DataFrame({
    'Category': ['A', 'A', 'B', 'B'],
    'Value': [10, 15, 10, 15]
})
grouped_df = df_group.groupby('Category').sum()

df_group_multi = pd.DataFrame({
    'Category': ['A', 'A', 'B', 'B'],
    'Subcategory': ['X', 'Y', 'X', 'Y'],
    'Value': [10, 15, 10, 15]
})
grouped_multi_df = df_group_multi.groupby(['Category', 'Subcategory']).sum()


### Reshaping and Pivot Tables

In [56]:
df_reshape = pd.DataFrame({
    'A': [1, 2, 3],
    'B': [4, 5, 6],
    'C': [7, 8, 9]
}, index=['one', 'two', 'three'])
stacked_df = df_reshape.stack()


In [57]:
df_pivot = pd.DataFrame({
    'Date': pd.date_range(start="2024-01-01", periods=6),
    'City': ['A', 'B', 'A', 'B', 'A', 'B'],
    'Sales': [100, 200, 150, 250, 130, 270]
})
pivot_table = df_pivot.pivot_table(values='Sales', index='Date', columns='City', aggfunc='sum')


### Categorical Data

In [58]:
df_categorical = pd.DataFrame({
    'id': [1, 2, 3, 4],
    'raw_grade': ['a', 'b', 'b', 'a']
})

df_categorical['raw_grade'] = df_categorical['raw_grade'].astype('category')

# Correctly rename the categories
df_categorical['raw_grade'] = df_categorical['raw_grade'].cat.rename_categories(["very bad", "bad"])

# Set new categories with the correct order
df_categorical['raw_grade'] = df_categorical['raw_grade'].cat.set_categories(["very bad", "bad", "medium", "good", "very good"])

# Sort the DataFrame by the grade column
df_categorical = df_categorical.sort_values(by='raw_grade')

# Group by the grade column and calculate the size of each group
grouped_categorical = df_categorical.groupby('raw_grade').size()

print(df_categorical)
print(grouped_categorical)


   id raw_grade
0   1  very bad
3   4  very bad
1   2       bad
2   3       bad
raw_grade
very bad     2
bad          2
medium       0
good         0
very good    0
dtype: int64


  grouped_categorical = df_categorical.groupby('raw_grade').size()
