In [57]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

## Q. I tried to read tabular data using pd.read_table() and ended up with the following output. How can I get it into the proper format?

In [58]:
# Read the file with read_table's defaults (tab separator, first line used as the header).
# As the output shows, every pipe-delimited record lands in a single column, and the
# first record ("1|24|M|technician|85711") is consumed as the column name.
movie_df = pd.read_table('u.user')
movie_df

Unnamed: 0,1|24|M|technician|85711
0,2|53|F|other|94043
1,3|23|M|writer|32067
2,4|24|M|technician|43537
3,5|33|F|other|15213
4,6|42|M|executive|98101
...,...
937,939|26|F|student|33319
938,940|32|M|administrator|02215
939,941|20|M|student|97229
940,942|48|F|librarian|78209


#### First, let's look at the issues.
- The data is separated by the pipe symbol "|".
- The first row is taken as the column header.
- The columns have no names.

#### Issue 1
- Data separated by "|"


In [59]:
# sep="|" tells the parser that fields are pipe-delimited, so each record splits into five columns.
# The first record is still being used as the header row at this point.
movie_df = pd.read_table('u.user', sep ="|")
movie_df

Unnamed: 0,1,24,M,technician,85711
0,2,53,F,other,94043
1,3,23,M,writer,32067
2,4,24,M,technician,43537
3,5,33,F,other,15213
4,6,42,M,executive,98101
...,...,...,...,...,...
937,939,26,F,student,33319
938,940,32,M,administrator,02215
939,941,20,M,student,97229
940,942,48,F,librarian,78209


#### Issue 2
- The first row is taken as column header

In [60]:
# header=None: the file has no header line, so the first record is kept as data.
# pandas labels the columns with integers 0-4 instead, and all 942 records are retained.
movie_df = pd.read_table('u.user', sep ="|", header =None)
movie_df

Unnamed: 0,0,1,2,3,4
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213
...,...,...,...,...,...
938,939,26,F,student,33319
939,940,32,M,administrator,02215
940,941,20,M,student,97229
941,942,48,F,librarian,78209


#### Issue 3
- No name for each column

In [61]:
# Supply our own column labels: the file has no header line (header=None),
# so `names` assigns these five labels, in order, to the pipe-separated fields.
col_names = ['User_id', 'Age', 'Gender', 'Occupation', 'pincode']
movie_df = pd.read_table(
    'u.user',
    sep='|',
    header=None,
    names=col_names,
)
movie_df

Unnamed: 0,User_id,Age,Gender,Occupation,pincode
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213
...,...,...,...,...,...
938,939,26,F,student,33319
939,940,32,M,administrator,02215
940,941,20,M,student,97229
941,942,48,F,librarian,78209


## Q. I found missing values in the given dataset.
#### What are the ways in which I can drop them?

In [62]:
# Load the UFO sightings data. The displayed rows already show missing values (NaN),
# e.g. in "Colors Reported" and "Shape Reported".
sightings_df = pd.read_csv('ufo.csv')
sightings_df

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00
...,...,...,...,...,...
18236,Grant Park,,TRIANGLE,IL,12/31/2000 23:00
18237,Spirit Lake,,DISK,IA,12/31/2000 23:00
18238,Eagle River,,,WI,12/31/2000 23:45
18239,Eagle River,RED,LIGHT,WI,12/31/2000 23:45


In [63]:
sightings_df.isnull() 
# Returns a new DataFrame of the same shape holding booleans: True where a value is
# missing (NaN) and False where a value is present. sightings_df itself is not changed.

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,False,True,False,False,False
1,False,True,False,False,False
2,False,True,False,False,False
3,False,True,False,False,False
4,False,True,False,False,False
...,...,...,...,...,...
18236,False,True,False,False,False
18237,False,True,False,False,False
18238,False,True,True,False,False
18239,False,False,False,False,False


In [64]:
sightings_df.notnull()
# The inverse of isnull(): True where a value is present and False where it is missing (NaN).

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,True,False,True,True,True
1,True,False,True,True,True
2,True,False,True,True,True
3,True,False,True,True,True
4,True,False,True,True,True
...,...,...,...,...,...
18236,True,False,True,True,True
18237,True,False,True,True,True
18238,True,False,False,True,True
18239,True,True,True,True,True


In [65]:
sightings_df.isnull().sum()
# .sum() treats True as 1 and False as 0. By default (axis=0) it adds the values down each
# column, so the result is the number of missing values (NaN) per column.


City                  25
Colors Reported    15359
Shape Reported      2644
State                  0
Time                   0
dtype: int64

#### Columns State and Time don't have any NaN

In [66]:
# Per-column summary of non-null counts and dtypes. Each non-null count equals the total
# number of rows (18241) minus that column's NaN count from isnull().sum().
sightings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18241 entries, 0 to 18240
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   City             18216 non-null  object
 1   Colors Reported  2882 non-null   object
 2   Shape Reported   15597 non-null  object
 3   State            18241 non-null  object
 4   Time             18241 non-null  object
dtypes: object(5)
memory usage: 712.7+ KB


In [67]:
sightings_df.shape

# (rows, columns) of the original data, before any rows are dropped.

(18241, 5)

### DROPPING ROWS HAVING NaN

In [68]:
sightings_df.dropna(how='any').shape

# how='any' drops a row if ANY of its entries is missing (NaN).
# dropna() returns a new DataFrame, so sightings_df itself is not modified;
# .shape reports the (rows, columns) of that returned result.

(2486, 5)

In [69]:
sightings_df.dropna(how='all').shape

# how='all' drops a row only if ALL of its entries are missing (NaN).
# .shape reports the (rows, columns) of the returned result.
# No rows are dropped here: columns State and Time don't have any NaN, so no row is entirely missing.

(18241, 5)

#### No rows are dropped: columns State and Time don't have any NaN, so no row can have ALL of its entries missing.

In [70]:
sightings_df.dropna(subset =['City']).shape

# subset=['City'] restricts the check to the 'City' column: a row is dropped only if its City is missing (NaN).
# The 25 rows with a missing City are removed (18241 - 25 = 18216).
# .shape reports the (rows, columns) of the returned result.

(18216, 5)

In [71]:
sightings_df.dropna(subset =['City','Colors Reported'], how ='any').shape

# Only the columns listed in subset are checked. With how='any', a row is dropped if
# ANY of those columns is missing (NaN) in that row; NaN in other columns is ignored.
# .shape reports the (rows, columns) of the returned result.

(2877, 5)

In [72]:
sightings_df.dropna(subset=['City','Colors Reported'], how='all').shape

# Only the columns listed in subset are checked. With how='all', a row is dropped only if
# ALL of those columns are missing (NaN) in that row.
# Here 20 rows have both City and Colors Reported missing (18241 - 20 = 18221).
# .shape reports the (rows, columns) of the returned result.

(18221, 5)