# pandas concatenation of DataFrame

In [1]:
import pandas as pd 

In [2]:
# Read the four source tables in one pass; index_col=False tells pandas
# not to use the first CSV column as the index.
ratings1, ratings2, movies, dates = (
    pd.read_csv(csv_path, index_col=False)
    for csv_path in (
        'data/ratings1.csv',
        'data/ratings2.csv',
        'data/movies.csv',
        'data/dates.csv',
    )
)

### .concat()
* objs= list of objects to concatenate
* axis=0 (default) concatenate by rows
* axis=1 concatenate by columns
* join='inner' keep only the labels common to all objects (intersection)
* join='outer' (default) keep all labels from all objects (union)
* ignore_index=False (default) keep the original indexes
* ignore_index=True replace the original indexes with a new 0..n-1 index

In [3]:
# Stack the two ratings parts on top of each other and renumber the rows
# from zero, then print the structure of the result and of `dates`.
ratings = pd.concat(objs=[ratings1, ratings2], axis=0, ignore_index=True)
ratings.info()
dates.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100837 entries, 0 to 100836
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   userId   100837 non-null  int64  
 1   movieId  100837 non-null  int64  
 2   rating   100837 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 2.3 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   date    100836 non-null  object
dtypes: object(1)
memory usage: 787.9+ KB


### .drop_duplicates()
* subset= column label or sequence of labels; by default all columns are used
* keep='first' (default) drop duplicates except for the first occurrence
* keep='last' drop duplicates except for the last occurrence
* keep=False drop all duplicates
* inplace=False (default) return a new DF and leave the original unchanged
* inplace=True modify the DF in place and return None
* ignore_index=False (default) keep the original indexes
* ignore_index=True replace the original indexes with a new 0..n-1 index

In [4]:
# Remove fully duplicated rows (the first occurrence is kept) and rebuild
# a contiguous index, then print the structure of the result.
ratings = ratings.drop_duplicates(keep='first', ignore_index=True)
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   userId   100836 non-null  int64  
 1   movieId  100836 non-null  int64  
 2   rating   100836 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 2.3 MB


In [5]:
# Glue the `date` column onto the ratings side by side; rows are aligned
# by index label.
ratings_dates = pd.concat(objs=[ratings, dates], axis='columns')
ratings_dates.head(n=5)

Unnamed: 0,userId,movieId,rating,date
0,1,1,4.0,2000-07-30 18:45:03
1,1,3,4.0,2000-07-30 18:20:47
2,1,6,4.0,2000-07-30 18:37:04
3,1,47,5.0,2000-07-30 19:03:35
4,1,50,5.0,2000-07-30 18:48:51


# concatenation types SQL

### inner join
* returns only the rows whose key is present in both tables; rows without a match are dropped
### outer join
* full outer join returns all rows from both tables; where a row has no match in the other table, the missing columns are filled with NULL
* left outer join returns all rows from the left table, with the matching rows from the right table or NULL where there is no match
* right outer join returns all rows from the right table, with the matching rows from the left table or NULL where there is no match

![alt text](dst3-u1-md12_7_1.png)

# .set_index()
* keys= a single column label, a single array of the same length as the calling DataFrame, or a list of these
* drop=True (default) delete the columns that are used as the new index
* append=False (default) replace the existing index; append=True appends the columns to the existing index
* inplace=False (default) return a new DF and leave the original unchanged
* inplace=True modify the DF in place and return None

# concatenation types pandas 

### .join()
* other= the 'right' table (DataFrame, Series, or a list of them) to join to the calling DF
* on= column(s) in the calling DF to match against the index of other; if omitted, the join is index-on-index
* how='left' (default) use the calling DF's index (or the `on` column)
* how='right' use other's index
* how='outer' use the union of the calling DF's and other's indexes
* how='inner' use the intersection of the calling DF's and other's indexes
* how='cross' create the cartesian product of both frames, preserving the order of the left keys
* lsuffix= suffix to add to the left frame’s overlapping column names
* rsuffix= suffix to add to the right frame’s overlapping column names
* sort=False (default) do not sort the result by the join key


In [10]:
# Preview the left-hand table before joining (first five rows).
ratings_dates.head(n=5)

Unnamed: 0,userId,movieId,rating,date
0,1,1,4.0,2000-07-30 18:45:03
1,1,3,4.0,2000-07-30 18:20:47
2,1,6,4.0,2000-07-30 18:37:04
3,1,47,5.0,2000-07-30 19:03:35
4,1,50,5.0,2000-07-30 18:48:51


In [11]:
# Preview the right-hand table before joining (first five rows).
movies.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [12]:
# .join() matches the caller's `on` column against the *index* of `other`,
# so movies is re-indexed by movieId first. A left join keeps every rating.
joined = ratings_dates.join(other=movies.set_index(keys='movieId'), on='movieId', how='left')
joined.head(n=5)

Unnamed: 0,userId,movieId,rating,date,title,genres
0,1,1,4.0,2000-07-30 18:45:03,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,2000-07-30 18:20:47,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,2000-07-30 18:37:04,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,2000-07-30 19:03:35,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,2000-07-30 18:48:51,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


### .merge()
* right= object to merge with
* how='left' use keys from left frame
* how='right' use keys from right frame
* how='outer' use union of keys from both frames
* how='inner' use intersection of keys from both frames
* how='cross' creates the cartesian product from both frames
* on= column or index level name(s) to join on; must be present in both DFs
* left_on= column or index level name(s) to join on in the left DF
* right_on= column or index level name(s) to join on in the right DF
* left_index=False (default); set to True to use the index of the left DF as the join key
* right_index=False (default); set to True to use the index of the right DF as the join key
* sort=False by default
* suffixes list-like, default is (“_x”, “_y”)
* copy=True by default
* indicator=bool or str, default False

In [15]:
# Same left join as with .join(), expressed through the module-level merge
# function: both frames are matched on their shared movieId column.
merged = pd.merge(left=ratings_dates, right=movies, how='left', on='movieId')
merged.head(n=5)

Unnamed: 0,userId,movieId,rating,date,title,genres
0,1,1,4.0,2000-07-30 18:45:03,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,2000-07-30 18:20:47,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,2000-07-30 18:37:04,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,2000-07-30 19:03:35,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,2000-07-30 18:48:51,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


![alt text](dst3-u1-md12_7_10.png)

### DataFrame concatenation practice

In [16]:
# Practice data, written row by row so each record reads as one line.

# Warehouse stock: one row per item.
items_df = pd.DataFrame(
    data=[
        (417283, 'Samsung', 54),
        (849734, 'LG', 33),
        (132223, 'Apple', 122),
        (573943, 'Apple', 18),
        (19475, 'LG', 102),
        (3294095, 'Apple', 43),
        (382043, 'Samsung', 77),
        (302948, 'Samsung', 143),
        (100132, 'LG', 60),
        (312394, 'ZTE', 19),
    ],
    columns=['item_id', 'vendor', 'stock_count'],
)

# Purchases: one row per purchased item; a purchase_id may repeat.
purchase_df = pd.DataFrame(
    data=[
        (101, 417283, 13900),
        (101, 849734, 5330),
        (101, 132223, 38200),
        (112, 573943, 49990),
        (121, 19475, 9890),
        (145, 3294095, 33000),
        (145, 382043, 67500),
        (145, 302948, 34500),
        (145, 103845, 89900),
        (221, 100132, 11400),
    ],
    columns=['purchase_id', 'item_id', 'price'],
)

In [23]:
# DataFrame.info() prints its summary itself and returns None, so wrapping
# it in display() only adds a stray "None" line to the cell output.
items_df.info()
purchase_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   item_id      10 non-null     int64 
 1   vendor       10 non-null     object
 2   stock_count  10 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 372.0+ bytes


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   purchase_id  10 non-null     int64
 1   item_id      10 non-null     int64
 2   price        10 non-null     int64
dtypes: int64(3)
memory usage: 372.0 bytes


None