# Explore Data Arrays With NumPy

In [1]:
data = [50,50,47,97,49,3,53,42,26,74,82,62,37,15,70,27,36,35,48,52,63,64]
print(data)

[50, 50, 47, 97, 49, 3, 53, 42, 26, 74, 82, 62, 37, 15, 70, 27, 36, 35, 48, 52, 63, 64]


# Use the NumPy package for data analysis and manipulation

# Import Numpy

In [2]:
import numpy as np
height=np.array(data)
print(height)

[50 50 47 97 49  3 53 42 26 74 82 62 37 15 70 27 36 35 48 52 63 64]


# Demonstrate the difference between a list and NumPy array

In [3]:
print(type(data),'x 2:',data*2)
print('---')
print(type(height),'x 2:',height*2)

<class 'list'> x 2: [50, 50, 47, 97, 49, 3, 53, 42, 26, 74, 82, 62, 37, 15, 70, 27, 36, 35, 48, 52, 63, 64, 50, 50, 47, 97, 49, 3, 53, 42, 26, 74, 82, 62, 37, 15, 70, 27, 36, 35, 48, 52, 63, 64]
---
<class 'numpy.ndarray'> x 2: [100 100  94 194  98   6 106  84  52 148 164 124  74  30 140  54  72  70
  96 104 126 128]


# Look at the shape

In [4]:
height.shape

(22,)

# Access individual elements using their ordinal position

In [5]:
height[0]

50

# Find the average grade

In [6]:
height.mean()

49.18181818181818

# Add another data set record in the typical hours they spend studying

In [7]:
#define an array of study hours
import numpy as np
study_hours=[10.0,11.5,9.0,16.0,9.25,1.0,11.5,9.0,8.5,14.5,15.5,
               13.75,9.0,8.0,15.5,8.0,9.0,6.0,10.0,12.0,12.5,12.0]
#create a 2d array (an array of arrays )
student_data=np.array([study_hours,height])
#display the array
student_data

array([[10.  , 11.5 ,  9.  , 16.  ,  9.25,  1.  , 11.5 ,  9.  ,  8.5 ,
        14.5 , 15.5 , 13.75,  9.  ,  8.  , 15.5 ,  8.  ,  9.  ,  6.  ,
        10.  , 12.  , 12.5 , 12.  ],
       [50.  , 50.  , 47.  , 97.  , 49.  ,  3.  , 53.  , 42.  , 26.  ,
        74.  , 82.  , 62.  , 37.  , 15.  , 70.  , 27.  , 36.  , 35.  ,
        48.  , 52.  , 63.  , 64.  ]])

# look at the shape of the data

In [8]:
#show shape of the 2d array
student_data.shape

(2, 22)

# Navigate the structure to find the first value of the second array

In [9]:
student_data[1][0]

50.0

# Compare the mean study time to the mean height


In [10]:
#get the mean of each sub-array
avg_study=student_data[0].mean()
avg_height=student_data[1].mean()
print("Average study hours:{:2f}\n Average grade:{:.2f}".format(avg_study,avg_height))

Average study hours:10.522727
 Average grade:49.18


# import pandas library and create a dataframe with three columns -name,studyhours(perweek),grade

In [11]:
import pandas as pd
df_students=pd.DataFrame({'Name': ['Dan', 'Joann', 'Pedro', 'Rosie', 'Ethan', 'Vicky', 'Frederic', 'Jimmie', 
                                     'Rhonda', 'Giovanni', 'Francesca', 'Rajab', 'Naiyana', 'Kian', 'Jenny',
                                     'Jakeem','Helena','Ismat','Anila','Skye','Daniel','Aisha'],
                            'StudyHours':student_data[0],
                            'Grade':student_data[1]})
df_students

Unnamed: 0,Name,StudyHours,Grade
0,Dan,10.0,50.0
1,Joann,11.5,50.0
2,Pedro,9.0,47.0
3,Rosie,16.0,97.0
4,Ethan,9.25,49.0
5,Vicky,1.0,3.0
6,Frederic,11.5,53.0
7,Jimmie,9.0,42.0
8,Rhonda,8.5,26.0
9,Giovanni,14.5,74.0


# finding and filtering data in the dataframe

In [12]:
#get the data for index value 5
df_students
.loc[5]

Name          Vicky
StudyHours      1.0
Grade           3.0
Name: 5, dtype: object

# Finding and filtering a range in the dataframe


In [13]:
#get the rows with index value from 0 to 5
df_students.loc[0:5]

Unnamed: 0,Name,StudyHours,Grade
0,Dan,10.0,50.0
1,Joann,11.5,50.0
2,Pedro,9.0,47.0
3,Rosie,16.0,97.0
4,Ethan,9.25,49.0
5,Vicky,1.0,3.0


# Use the iloc method to find the rows based on their ordinal position in the dataframe

In [15]:
#get data in the first five rows
df_students.iloc[0:5]

Unnamed: 0,Name,StudyHours,Grade
0,Dan,10.0,50.0
1,Joann,11.5,50.0
2,Pedro,9.0,47.0
3,Rosie,16.0,97.0
4,Ethan,9.25,49.0


In [16]:
df_students.iloc[0,[1,2]]

StudyHours    10.0
Grade         50.0
Name: 0, dtype: object

In [17]:
df_students.loc[0,'Grade']

50.0

# Find indexed rows based on filtering expressions

In [19]:
df_students.loc[df_students["Name"]=="Aisha"]

Unnamed: 0,Name,StudyHours,Grade
21,Aisha,12.0,64.0


In [22]:
df_students.loc[df_students['Name']=="Ethan"]

Unnamed: 0,Name,StudyHours,Grade
4,Ethan,9.25,49.0


# Applying a dataframe filtering expression without the loc method

In [23]:
df_students[df_students["Name"]=="Aisha"]

Unnamed: 0,Name,StudyHours,Grade
21,Aisha,12.0,64.0


# Or use the dataframe query method

In [26]:
df_students.query('Name=="Aisha"')

Unnamed: 0,Name,StudyHours,Grade
21,Aisha,12.0,64.0


# or use the column as a property of the dataframe

In [27]:
df_students[df_students.Name=="Aisha"]


Unnamed: 0,Name,StudyHours,Grade
21,Aisha,12.0,64.0


# You can load data from a file

In [40]:
!wget https://raw.githubusercontent.com/MicrosoftDocs/mslearn-introduction-to-machine-learning/main/Data/ml-basics/grades.csv
import pandas as pd
# Load the CSV file into a DataFrame
df_students = pd.read_csv('grades.csv')

# Display the first few rows of the DataFrame
df_students.head()

'wget' is not recognized as an internal or external command,
operable program or batch file.


FileNotFoundError: [Errno 2] No such file or directory: 'grades.csv'

# Handle missing data using the isnull method-identify missing data

In [33]:
df_students.isnull()

Unnamed: 0,Name,StudyHours,Grade
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
5,False,False,False
6,False,False,False
7,False,False,False
8,False,False,False
9,False,False,False
