## Series

In [None]:
import pandas as pd
import numpy as np
import random

In [4]:
first_series = pd.Series([1,2,3, np.nan ,"hello"])
first_series

0        1
1        2
2        3
3      NaN
4    hello
dtype: object

In [5]:
series = pd.Series([1,2,3, np.nan ,"hello"], index = ['A','B','C','Unknown','String'])
series
#indexing the Series with custom values

A              1
B              2
C              3
Unknown      NaN
String     hello
dtype: object

In [7]:
dict = {"Python": "Fun", "C++": "Outdated","Coding":"Hmm.."}
series = pd.Series(dict)
series
# Dict to pandas Series

Python         Fun
C++       Outdated
Coding       Hmm..
dtype: object

In [10]:
series[['Coding','Python']]

Coding    Hmm..
Python      Fun
dtype: object

In [11]:
series.index

Index(['Python', 'C++', 'Coding'], dtype='object')

In [12]:
series.values

array(['Fun', 'Outdated', 'Hmm..'], dtype=object)

In [13]:
series.describe()

count            3
unique           3
top       Outdated
freq             1
dtype: object

In [15]:
#Series is a mutable data structures and you can easily change any item’s value: 
series['Coding'] = 'Awesome'
series

Python         Fun
C++       Outdated
Coding     Awesome
dtype: object

In [17]:
# add new values:
series['Java'] = 'Okay'
series

Python         Fun
C++       Outdated
Coding     Awesome
Java          Okay
dtype: object

In [25]:
# If it is necessary to apply any mathematical operation to Series items, you may done it like below:
num_series = pd.Series([1,2,3,4,5,6,None])
num_series_changed = num_series/2
num_series_changed

0    0.5
1    1.0
2    1.5
3    2.0
4    2.5
5    3.0
6    NaN
dtype: float64

In [26]:
# NULL/NaN checking can be performed with isnull() and notnull().
print(series.isnull())
print(num_series.notnull())
print(num_series_changed.notnull())

Python    False
C++       False
Coding    False
Java      False
dtype: bool
0     True
1     True
2     True
3     True
4     True
5     True
6    False
dtype: bool
0     True
1     True
2     True
3     True
4     True
5     True
6    False
dtype: bool


## DataFrames 

In [28]:
data = {'year': [1990, 1994, 1998, 2002, 2006, 2010, 2014],
        'winner': ['Germany', 'Brazil', 'France', 'Brazil','Italy', 'Spain', 'Germany'],
        'runner-up': ['Argentina', 'Italy', 'Brazil','Germany', 'France', 'Netherlands', 'Argentina'],
        'final score': ['1-0', '0-0 (pen)', '3-0', '2-0', '1-1 (pen)', '1-0', '1-0'] }
world_cup = pd.DataFrame(data, columns=['year', 'winner', 'runner-up', 'final score'])
world_cup

Unnamed: 0,year,winner,runner-up,final score
0,1990,Germany,Argentina,1-0
1,1994,Brazil,Italy,0-0 (pen)
2,1998,France,Brazil,3-0
3,2002,Brazil,Germany,2-0
4,2006,Italy,France,1-1 (pen)
5,2010,Spain,Netherlands,1-0
6,2014,Germany,Argentina,1-0


In [29]:
# Other recipe to set a DataFrame is the using of Python list of dictionaries:

data_2 = [{'year': 1990, 'winner': 'Germany', 'runner-up': 'Argentina', 'final score': '1-0'}, 
          {'year': 1994, 'winner': 'Brazil', 'runner-up': 'Italy', 'final score': '0-0 (pen)'},
          {'year': 1998, 'winner': 'France', 'runner-up': 'Brazil', 'final score': '3-0'}, 
          {'year': 2002, 'winner': 'Brazil', 'runner-up': 'Germany', 'final score': '2-0'}, 
          {'year': 2006, 'winner': 'Italy','runner-up': 'France', 'final score': '1-1 (pen)'}, 
          {'year': 2010, 'winner': 'Spain', 'runner-up': 'Netherlands', 'final score': '1-0'}, 
          {'year': 2014, 'winner': 'Germany', 'runner-up': 'Argentina', 'final score': '1-0'}
         ]
world_cup = pd.DataFrame(data_2)
world_cup

Unnamed: 0,final score,runner-up,winner,year
0,1-0,Argentina,Germany,1990
1,0-0 (pen),Italy,Brazil,1994
2,3-0,Brazil,France,1998
3,2-0,Germany,Brazil,2002
4,1-1 (pen),France,Italy,2006
5,1-0,Netherlands,Spain,2010
6,1-0,Argentina,Germany,2014


In [37]:
print("First 2 Rows: ",end="\n\n")
print (world_cup.head(2),end="\n\n")
print ("Last 2 Rows : ",end="\n\n")
print (world_cup.tail(2),end="\n\n")
print("Using slicing : ",end="\n\n")
print (world_cup[2:4])

First 2 Rows: 

  final score  runner-up   winner  year
0         1-0  Argentina  Germany  1990
1   0-0 (pen)      Italy   Brazil  1994

Last 2 Rows : 

  final score    runner-up   winner  year
5         1-0  Netherlands    Spain  2010
6         1-0    Argentina  Germany  2014

Using slicing : 

  final score runner-up  winner  year
2         3-0    Brazil  France  1998
3         2-0   Germany  Brazil  2002


### CSV
#### Reading:

`df = pd.read_csv("path\to\the\csv\file\for\reading")`
#### Writing:

`df.to_csv("path\to\the\folder\where\you\want\save\csv\file")`


### TXT file(s)
(txt file can be read as a CSV file with other separator (delimiter); we suppose below that columns are separated by tabulation):

#### Reading:

`df = pd.read_csv("path\to\the\txt\file\for\reading", sep='\t')`
#### Writing:

`df.to_csv("path\to\the\folder\where\you\want\save\txt\file", sep='\t')`
### JSON files
(an open-standard format that uses human-readable text to transmit data objects consisting of attribute–value pairs. It is the most common data format used for asynchronous browser/server communication. By its view it is very similar to Python dictionary)

#### Reading:

`df = pd.read_json("path\to\the\json\file\for\reading", sep='\t')`
#### Writing:

`df.to_json("path\to\the\folder\where\you\want\save\json\file", sep='\t')`

In [41]:
# To write world_cup Dataframe to a CSV File 
world_cup.to_csv("worldcup.csv")
# To save CSV file without index use index=False attribute

print("File Written!",end="\n\n")

#To check if it was written 
import os
print(os.path.exists('worldcup.csv'))

# reading from it in a new dataframe df
df = pd.read_csv('worldcup.csv')
print(df.head())



File Written!

True
   Unnamed: 0 final score  runner-up   winner  year
0           0         1-0  Argentina  Germany  1990
1           1   0-0 (pen)      Italy   Brazil  1994
2           2         3-0     Brazil   France  1998
3           3         2-0    Germany   Brazil  2002
4           4   1-1 (pen)     France    Italy  2006


In [43]:
# We can also load the data without index as : 
df = pd.read_csv('worldcup.csv',index_col=0)
print(df)

  final score    runner-up   winner  year
0         1-0    Argentina  Germany  1990
1   0-0 (pen)        Italy   Brazil  1994
2         3-0       Brazil   France  1998
3         2-0      Germany   Brazil  2002
4   1-1 (pen)       France    Italy  2006
5         1-0  Netherlands    Spain  2010
6         1-0    Argentina  Germany  2014


In [56]:
movies=pd.read_csv("data/movies.csv",encoding = "ISO-8859-1") 
# encoding is added only for this specific dataset because it gave error with utf-8

In [57]:
movies['release_date'] = movies['release_date'].map(pd.to_datetime)
print(movies.head(20))

#print(movies.describe())

    user_id  movie_id  rating  timestamp   age gender     occupation zip_code  \
0       196       242       3  881250949  49.0      M         writer    55105   
1       305       242       5  886307828  23.0      M     programmer    94086   
2         6       242       4  883268170  42.0      M      executive    98101   
3       234       242       4  891033261  60.0      M        retired    94702   
4        63       242       3  875747190  31.0      M      marketing    75240   
5       181       242       1  878961814  26.0      M      executive    21218   
6       201       242       4  884110598  27.0      M         writer    E2A4H   
7       249       242       5  879571438  25.0      M        student    84103   
8        13       242       2  881515193  47.0      M       educator    29206   
9       279       242       3  877756647  33.0      M     programmer    85251   
10      145       242       5  875269755  31.0      M  entertainment    V3N4P   
11       90       242       

In [54]:
movies_rating = movies['rating']
# Here we are showing only one column, i.e. a Series
print ('type:', type(movies_rating))
movies_rating.head()

type: <class 'pandas.core.series.Series'>


0    3
1    5
2    4
3    4
4    3
Name: rating, dtype: int64

In [63]:
# Filtering data 
# Let's display only women
movies_user_female = movies[movies['gender']=='F']
print(movies_user_female.head())

    user_id  movie_id  rating  timestamp   age gender     occupation zip_code  \
13       18       242       5  880129305  35.0      F          other    37212   
18      123       242       5  879809053   NaN      F         artist    20008   
19      296       242       4  884196057  43.0      F  administrator    16803   
21      270       242       5  876953744  18.0      F        student    63119   
22      240       242       5  885775683  23.0      F       educator    20784   

     movie_title release_date   ...    Fantasy  Film-Noir  Horror  Musical  \
13  Kolya (1996)   1997-01-24   ...          0          0       0        0   
18  Kolya (1996)   1997-01-24   ...          0          0       0        0   
19  Kolya (1996)   1997-01-24   ...          0          0       0        0   
21  Kolya (1996)   1997-01-24   ...          0          0       0        0   
22  Kolya (1996)   1997-01-24   ...          0          0       0        0   

    Mystery  Romance  Sci-Fi  Thriller  War 

In [65]:
#to see all the different values possible for a given column
occupation_list = movies['occupation']
print(occupation_list)

0               writer
1           programmer
2            executive
3              retired
4            marketing
5            executive
6               writer
7              student
8             educator
9           programmer
10       entertainment
11            educator
12            engineer
13               other
14                 NaN
15           marketing
16           scientist
17           executive
18              artist
19       administrator
20             student
21             student
22            educator
23                 NaN
24              writer
25                 NaN
26                 NaN
27           marketing
28       administrator
29             student
             ...      
99970         educator
99971            other
99972            other
99973            other
99974    administrator
99975           artist
99976           artist
99977           artist
99978           artist
99979           artist
99980           artist
99981    entertainment
99982      