<a href="https://colab.research.google.com/github/SurajKande/Pipelining/blob/master/ETL_casestudy_recommended_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Complete the connection URI 
from datetime import datetime

import pandas as pd
import sqlalchemy

# URI layout: postgresql://<username>:<password>@<host>:<port>/<database>
# Here username = "repl" and password = "password".
connection_uri = "postgresql://repl:password@localhost:5432/courses_application"

# The extract functions look the engine up by database name
# (db_engines["courses_application"]), so the engines are kept in a dict keyed
# by database name; a bare Engine object is not subscriptable.
db_engines = {"courses_application": sqlalchemy.create_engine(connection_uri)}

In [0]:
# Extract the rating data into a DataFrame    
def extract_rating_data(db_engines):
  """Read every row of the `rating` table.

  Args:
    db_engines: Mapping of database name to engine/connection; the entry
      stored under "courses_application" is used to run the query.

  Returns:
    The result of `pd.read_sql` for `SELECT * FROM rating`.
  """
  source_engine = db_engines["courses_application"]
  query = "SELECT * FROM rating"
  return pd.read_sql(query, source_engine)

def extract_course_data(db_engines):
  """Read every row of the `course` table.

  Args:
    db_engines: Mapping of database name to engine/connection; the entry
      stored under "courses_application" is used to run the query.

  Returns:
    The result of `pd.read_sql` for `SELECT * FROM course`.
  """
  source_engine = db_engines["courses_application"]
  query = "SELECT * FROM course"
  return pd.read_sql(query, source_engine)

In [0]:
def transform_avg_rating(rating_data):
  """Compute the mean rating of each course, best-rated first.

  Args:
    rating_data: DataFrame with `course_id` and `rating` columns.

  Returns:
    DataFrame with a `course_id` column and a `rating` column holding the
    per-course mean, ordered by descending mean with a fresh 0..n-1 index.
  """
  per_course_mean = rating_data.groupby("course_id")["rating"].mean()
  return (
      per_course_mean
      .sort_values(ascending=False)
      .reset_index()
  )

In [0]:
def transform_fill_programming_language(course_data, default_language="python"):
    """Fill missing `programming_language` values with a default.

    Args:
        course_data: DataFrame of courses. It is not modified; `fillna`
            returns a new frame.
        default_language: Value written wherever `programming_language` is
            missing. Defaults to "python", the value that was previously
            hard-coded.

    Returns:
        A copy of `course_data` with gaps in `programming_language` filled.
        Missing values in other columns are left untouched.
    """
    # The dict form restricts the fill to the single named column.
    return course_data.fillna({"programming_language": default_language})

In [0]:
def transform_recommendations(avg_course_ratings, courses_to_recommend, top_n=3):
    """Pick the highest-rated candidate courses for every user.

    Args:
        avg_course_ratings: DataFrame of per-course average ratings.
        courses_to_recommend: DataFrame of candidate courses per user.
        top_n: Maximum number of recommendations kept per user. Defaults to 3,
            the value that was previously hard-coded.

    Returns:
        DataFrame with columns `user_id`, `course_id`, `rating`, ordered by
        `user_id` ascending and, within a user, by `rating` descending, with a
        fresh 0..n-1 index.

    The merged frame must contain `user_id`, `course_id` and `rating`; the
    merge itself joins on whatever columns the two inputs share.
    """
    # Merge both DataFrames (pandas joins on the columns they have in common).
    merged = courses_to_recommend.merge(avg_course_ratings)

    # Sort on both keys at once. Re-sorting by user_id alone afterwards uses a
    # non-stable algorithm, so it could scramble the rating order inside a user.
    ranked = merged.sort_values(["user_id", "rating"], ascending=[True, False])

    # head() keeps the first top_n rows of each user in the sorted order.
    top_per_user = ranked.groupby("user_id").head(top_n)

    return top_per_user[["user_id", "course_id", "rating"]].reset_index(drop=True)

In [0]:
# Put the data into a database so that it can be used.
# `db_engine` is the engine that load_to_data_warehouse and
# recommendations_for_user read from module scope.
# NOTE(review): the database name "data_warehouseh" looks like a typo (perhaps
# for "data_warehouse") — confirm against the real database before relying on it.
# NOTE: this rebinds `connection_uri`, which was first assigned for the
# courses_application database.
connection_uri = "postgresql://repl:password@localhost:5432/data_warehouseh"
db_engine = sqlalchemy.create_engine(connection_uri)

def load_to_data_warehouse(recommendations, engine=None):
    """Write the recommendations to the `recommendations` table.

    Args:
        recommendations: DataFrame (any object with a pandas-style `to_sql`)
            holding the rows to store.
        engine: Optional engine/connection to write through. When omitted, the
            module-level `db_engine` is used, so one-argument calls behave as
            before; callers may also pass the engine explicitly, e.g.
            `load_to_data_warehouse(recommendations, db_engine)`.
    """
    target_engine = db_engine if engine is None else engine
    # if_exists="replace" drops any existing table before inserting the new rows.
    recommendations.to_sql("recommendations", target_engine, if_exists="replace")

In [0]:
# Define the ETL function that the daily scheduled job runs
def etl(db_engines):
  """Run the extract-transform-load pipeline for course recommendations.

  Args:
    db_engines: Passed unchanged to the extract functions, which look up the
      "courses_application" entry to query the source tables.

  Returns:
    None. The computed recommendations are written to the data warehouse.
  """
  # Extract: each source table is read exactly once.
  course_dataframe = extract_course_data(db_engines)
  rating_dataframe = extract_rating_data(db_engines)

  # Clean up data: report missing values per column, then fill the gaps in
  # programming_language.
  print(course_dataframe.isnull().sum())
  course_dataframe = transform_fill_programming_language(course_dataframe)

  # Get the average rating of each course.
  avg_course_data = transform_avg_rating(rating_dataframe)

  # NOTE(review): transform_courses_to_recommend is not defined in the visible
  # notebook cells — confirm it is defined before this function is called.
  courses_to_recommend = transform_courses_to_recommend(rating_dataframe, course_dataframe)

  # Calculate recommendations.
  recommendations = transform_recommendations(avg_course_data, courses_to_recommend)

  # Load to the database. The loader writes through the module-level
  # `db_engine`, so only the DataFrame is passed.
  load_to_data_warehouse(recommendations)

In [0]:
# Define the DAG so it runs on a daily basis (cron "0 0 * * *" = every day at midnight).
# Airflow refuses to add a task when neither the DAG nor the task has a
# start_date, so one is set here.
# NOTE(review): the date below is a placeholder — set it to the real go-live date.
# catchup=False stops the scheduler from backfilling one run per day since start_date.
dag = DAG(dag_id="recommendations",
          schedule_interval='0 0 * * *',
          start_date=datetime(2020, 1, 1),
          catchup=False)

# Run `etl()` through a PythonOperator; op_kwargs supplies its `db_engines` argument.
# dag=dag attaches the task to the DAG — without it the operator belongs to no
# DAG and is never scheduled.
task_recommendations = PythonOperator(
    task_id="recommendations_task",
    python_callable=etl,
    op_kwargs={"db_engines": db_engines},
    dag=dag,
)

In [0]:
# after uploading the data to the data warehouse

def recommendations_for_user(user_id, threshold=4.5):
  """Return the titles of the courses recommended to one user.

  Args:
    user_id: Value bound to the `user_id` filter of the query.
    threshold: Only rows whose `rating` is strictly greater than this value
      are returned. Defaults to 4.5.

  Returns:
    The `title` column of the query result as an array, ordered by
    descending rating.
  """
  # Join with the courses table; both filter values are bound as query
  # parameters rather than formatted into the SQL text.
  query = """
  SELECT title, rating FROM recommendations
    INNER JOIN courses ON courses.course_id = recommendations.course_id
    WHERE user_id=%(user_id)s AND rating>%(threshold)s
    ORDER BY rating DESC
  """
  bound_params = {"user_id": user_id, "threshold": threshold}
  matches = pd.read_sql(query, db_engine, params=bound_params)
  return matches["title"].values

In [0]:
# Try the function: print the titles recommended to user 12 whose rating is
# above 4.65.
print(recommendations_for_user(12, 4.65))