# Pandas

In [None]:
import pandas as pd

In [None]:
# Load the sample dataset (a CSV file in the working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='sample_data.csv')
# df.to_string()  # would render the whole frame as a single string

In [10]:
# Preview the first five rows to check the column names and value formats.
preview = df.head(n=5)
print(preview)

   ID      Name  Age  Salary  Experience (Years) Department  Performance Score
0   1  Person_1   50  100592                  17  Marketing                  6
1   2  Person_2   36  128018                   2      Sales                 10
2   3  Person_3   29  143252                   2         IT                 10
3   4  Person_4   42   38110                  28      Sales                  6
4   5  Person_5   40  109309                  23         IT                  1


In [11]:
# A Series built from a plain list gets a default integer index (0, 1, 2, ...).
values = [9, 8, 7]
print(pd.Series(data=values))

0    9
1    8
2    7
dtype: int64


In [12]:
# Passing `index` replaces the default 0..n-1 positions with custom labels.
labels = ['a', 'b', 'c']
print(pd.Series([9, 8, 7], index=labels))

a    9
b    8
c    7
dtype: int64


DataFrame from a list

In [16]:
# Each inner list is one row; `columns` names the fields by position.
data = [
    ['alex', 10],
    ['bob', 20],
    ['clark', 30],
]
column_names = ['Name', 'Age']
df = pd.DataFrame(data=data, columns=column_names)
print(df)

    Name  Age
0   alex   10
1    bob   20
2  clark   30


DataFrame from a dict

In [None]:
# Each key becomes a column name and each list holds that column's values.
data = {"Name": ['alex', 'bob', 'clark'], "Age": [10, 20, 30]}

# Pass index=['rank1', 'rank2', ...] to label the rows instead of using 0..n-1.
df = pd.DataFrame(data=data)
print(df)

    Name  Age
0   alex   10
1    bob   20
2  clark   30
0     alex
1      bob
2    clark
Name: Name, dtype: object


Accessing columns

In [30]:
df = pd.read_csv('sample_data.csv')
# A list of names selects a sub-DataFrame; a single name (df['Name']) selects one column.
name_and_age = df[['Name', 'Age']]
print(name_and_age.head())

       Name  Age
0  Person_1   50
1  Person_2   36
2  Person_3   29
3  Person_4   42
4  Person_5   40


If you want to set a specific column as the index

In [37]:
# Promote the 'Name' column to the row index, show the result, then undo it.
df = df.set_index('Name')
print(df.head(3))
# reset_index() turns 'Name' back into a regular column (it now comes first)
# and restores the default integer index.
df = df.reset_index()

          ID  Age  Salary  Experience (Years) Department  Performance Score
Name                                                                       
Person_1   1   50  100592                  17  Marketing                  6
Person_2   2   36  128018                   2      Sales                 10
Person_3   3   29  143252                   2         IT                 10


.loc and .iloc

In [53]:
# .loc  -> selects by label (index labels / column names); label slices include the end.
# .iloc -> selects by integer position, like array indexing; the end is excluded.

separator = '-----------------------'

# loc[row_labels, col_labels]: rows labelled 0 through 5 (inclusive), one column.
departments = df.loc[0:5, ["Department"]]
print(departments)
print(separator)

# A boolean mask can be used in the row position of .loc.
is_25_or_younger = df['Age'] <= 25
print(df.loc[is_25_or_younger, ["Name"]])
print(separator)

# First four rows, every column except the last one.
print(df.iloc[:4, :-1])

  Department
0  Marketing
1      Sales
2         IT
3      Sales
4         IT
5      Sales
-----------------------
         Name
11  Person_12
13  Person_14
17  Person_18
27  Person_28
33  Person_34
38  Person_39
47  Person_48
48  Person_49
50  Person_51
71  Person_72
76  Person_77
-----------------------
       Name  ID  Age  Salary  Experience (Years) Department
0  Person_1   1   50  100592                  17  Marketing
1  Person_2   2   36  128018                   2      Sales
2  Person_3   3   29  143252                   2         IT
3  Person_4   4   42   38110                  28      Sales


Adding/deleting columns

In [61]:
df = pd.read_csv('sample_data.csv')

# Derive a new column as the element-wise sum of two existing ones.
df['NewCol'] = df['Age'] + df['Experience (Years)']
# Assigning a shorter Series aligns on the index; rows without a match get NaN.
extra_values = pd.Series(['x', 'y', 'z'])
df['Another'] = extra_values
print(df.head(4))

# Two ways to remove a column in place: pop() returns the removed column, del discards it.
df.pop('Another')
del df['NewCol']
print(df.head(4))

   ID      Name  Age  Salary  Experience (Years) Department  \
0   1  Person_1   50  100592                  17  Marketing   
1   2  Person_2   36  128018                   2      Sales   
2   3  Person_3   29  143252                   2         IT   
3   4  Person_4   42   38110                  28      Sales   

   Performance Score  NewCol Another  
0                  6      67       x  
1                 10      38       y  
2                 10      31       z  
3                  6      70     NaN  
   ID      Name  Age  Salary  Experience (Years) Department  Performance Score
0   1  Person_1   50  100592                  17  Marketing                  6
1   2  Person_2   36  128018                   2      Sales                 10
2   3  Person_3   29  143252                   2         IT                 10
3   4  Person_4   42   38110                  28      Sales                  6


Adding/deleting rows

In [72]:
# Two one-row frames that share the same columns.
df1 = pd.DataFrame({"Name": ['Vikas'], "Age": [22]})
df2 = pd.DataFrame({"Name": ['Sakiv'], "Age": [69]})

# Stack df2's rows below df1's. ignore_index=True renumbers the result 0..n-1;
# without it both rows keep the label 0, so a label-based drop(0) would remove both.
# (DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.)
df1 = pd.concat([df1, df2], ignore_index=True)
# df1.drop(0)  # returns a copy without the row labelled 0

DataFrame functionalities

In [80]:
# Small two-row frame used to demonstrate the basic DataFrame attributes.
prac_df = pd.DataFrame({"Name": ["Vikas", 'Sakiv'], "Age": [22, 69]})
divider = '--------'

print(prac_df)
print(divider)

# .T swaps rows and columns.
transposed = prac_df.T
print(transposed)
print(divider)

# ndim: number of axes; shape: (rows, columns); size: rows * columns.
print('Number of dimensions:', prac_df.ndim)
print(divider)
print('Shape:', prac_df.shape)
print(divider)
print('Size:', prac_df.size)
print(divider)

# describe() summarises the numeric columns (only 'Age' here).
summary = prac_df.describe()
print(summary)

    Name  Age
0  Vikas   22
1  Sakiv   69
--------
          0      1
Name  Vikas  Sakiv
Age      22     69
--------
Number of dimensions: 2
--------
Shape: (2, 2)
--------
Size: 4
--------
             Age
count   2.000000
mean   45.500000
std    33.234019
min    22.000000
25%    33.750000
50%    45.500000
75%    57.250000
max    69.000000


Iterating over a DataFrame

In [84]:
# itertuples() yields one namedtuple per row: the index label is exposed as
# .Index and each column value as an attribute named after the column.
rows = prac_df.itertuples()
for row in rows:
    print(f'Name: {row.Name} at {row.Index}, Age: {row.Age} at {row.Index}')

Name: Vikas at 0, Age: 22 at 0
Name: Sakiv at 1, Age: 69 at 1


Cleaning

In [None]:
# --- Missing values: remove rows ---
df = pd.read_csv('sample_data.csv')
cleandf = df.dropna()  # new frame without rows containing NaN; inplace=True would modify df instead

# --- Missing values: replace with a value ---
cleandf = df.fillna(69)  # every NaN in every column becomes 69
age_filled = df['Age'].fillna(69)  # column-level fill; the result is a new Series
# Assign the filled column back. Calling fillna(..., inplace=True) on a column
# selection is chained assignment and is not guaranteed to update df itself.
df['Age'] = df['Age'].fillna(df['Age'].mean())

# --- Wrong format ---
# df['Date'] = pd.to_datetime(df['Date'])

# --- Wrong values ---
# The examples below are left as comments: they refer to a 'Duration' column,
# which is not among the columns shown for sample_data.csv earlier in this notebook.
# df.loc[7, 'Duration'] = 45
#
# for x in df.index:
#     if df.loc[x, "Duration"] > 120:
#         df.loc[x, "Duration"] = 120
#
# Vectorized equivalent of the loop above:
# df.loc[df['Duration'] > 120, 'Duration'] = 120

# --- Duplicates ---
# (Written as comments rather than a bare string literal: a string as the last
# expression of a cell would be echoed as the cell's output.)
# df.duplicated()                     # boolean Series marking repeated rows
# df.drop_duplicates(inplace=True)    # the method is drop_duplicates, not drop_duplicated