# Data Collection: 

# Webscraping basic RateMyProfessors (RMP) info for all professors at UC Davis

In [2]:
# import modules
import pandas as pd
import requests
import requests_cache
import json
import time
from tqdm import tqdm, trange

In [3]:
# set up cache
requests_cache.install_cache("profs")

## Request data from RMP

In [13]:
# endpoint and header for POST request
url = 'https://www.ratemyprofessors.com/graphql'
headers = {
    "Authorization": "Basic dGVzdDp0ZXN0", 
}

### Initial request (first 8 professors)

In [14]:

# data provided for GraphQL
data = {
    "query":"query TeacherSearchPaginationQuery(\n  $count: Int!\n  $cursor: String\n  $query: TeacherSearchQuery!\n) {\n  search: newSearch {\n    ...TeacherSearchPagination_search_1jWD3d\n  }\n}\n\nfragment TeacherSearchPagination_search_1jWD3d on newSearch {\n  teachers(query: $query, first: $count, after: $cursor) {\n    didFallback\n    edges {\n      cursor\n      node {\n        ...TeacherCard_teacher\n        id\n        __typename\n      }\n    }\n    pageInfo {\n      hasNextPage\n      endCursor\n    }\n    resultCount\n    filters {\n      field\n      options {\n        value\n        id\n      }\n    }\n  }\n}\n\nfragment TeacherCard_teacher on Teacher {\n  id\n  legacyId\n  avgRating\n  numRatings\n  ...CardFeedback_teacher\n  ...CardSchool_teacher\n  ...CardName_teacher\n  ...TeacherBookmark_teacher\n}\n\nfragment CardFeedback_teacher on Teacher {\n  wouldTakeAgainPercent\n  avgDifficulty\n}\n\nfragment CardSchool_teacher on Teacher {\n  department\n  school {\n    name\n    id\n  }\n}\n\nfragment CardName_teacher on Teacher {\n  firstName\n  lastName\n}\n\nfragment TeacherBookmark_teacher on Teacher {\n  id\n  isSaved\n}\n",
    "variables":{
        "count":4470, # number of professors
        #"cursor":cursor,
        "query":{
            "text":"",
            "schoolID":"U2Nob29sLTEwNzM=",
            "fallback":True,
            "departmentID":None
        }
    }
} 

In [None]:
# response/check status
response = requests.post(url, headers = headers, json=data)
response.raise_for_status()

In [16]:
# put response into dataframe
cursorDF = pd.DataFrame(response.json()['data']['search']['teachers']['edges'])
cursorDF

Unnamed: 0,cursor,node
0,YXJyYXljb25uZWN0aW9uOjA=,"{'__typename': 'Teacher', 'avgDifficulty': 2.9..."
1,YXJyYXljb25uZWN0aW9uOjE=,"{'__typename': 'Teacher', 'avgDifficulty': 2.7..."
2,YXJyYXljb25uZWN0aW9uOjI=,"{'__typename': 'Teacher', 'avgDifficulty': 3.1..."
3,YXJyYXljb25uZWN0aW9uOjM=,"{'__typename': 'Teacher', 'avgDifficulty': 2.5..."
4,YXJyYXljb25uZWN0aW9uOjQ=,"{'__typename': 'Teacher', 'avgDifficulty': 3.6..."
...,...,...
4465,YXJyYXljb25uZWN0aW9uOjQ0NjU=,"{'__typename': 'Teacher', 'avgDifficulty': 4.2..."
4466,YXJyYXljb25uZWN0aW9uOjQ0NjY=,"{'__typename': 'Teacher', 'avgDifficulty': 3.8..."
4467,YXJyYXljb25uZWN0aW9uOjQ0Njc=,"{'__typename': 'Teacher', 'avgDifficulty': 2.6..."
4468,YXJyYXljb25uZWN0aW9uOjQ0Njg=,"{'__typename': 'Teacher', 'avgDifficulty': 4.2..."


In [21]:
# initialize dataframe
profs = pd.DataFrame(columns=cursorDF['node'][0].keys())
profs

Unnamed: 0,__typename,avgDifficulty,avgRating,department,firstName,id,isSaved,lastName,legacyId,numRatings,school,wouldTakeAgainPercent


In [23]:
# loop to populate profs df
for i in trange(0, len(cursorDF)):
    
    # temporary dictionary
    tempDict = cursorDF['node'][i]
    
    # delete problematic/unnecessary dictionary within dictionary
    del tempDict['school']

    # convert dict to temporary dataframe
    tempDF = pd.DataFrame(tempDict, index=[i])

    # append tempDF to profs df
    profs.loc[len(profs)] = tempDF.iloc[0,:]

  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc

In [24]:
# view profs df
profs

Unnamed: 0,__typename,avgDifficulty,avgRating,department,firstName,id,isSaved,lastName,legacyId,numRatings,school,wouldTakeAgainPercent
0,Teacher,2.9,3.3,Languages,Robert,VGVhY2hlci05NTY2,False,Borgen,9566,39,,-1
1,Teacher,2.7,2.6,Anthropology,Aram,VGVhY2hlci05NTY4,False,Yengoyan,9568,47,,-1
2,Teacher,3.1,3.2,Social Science,Patrick,VGVhY2hlci05NTcw,False,Carroll-Burke,9570,13,,-1
3,Teacher,2.5,4.5,Anthropology,Henry,VGVhY2hlci0xMzQ4MQ==,False,McHenry,13481,124,,-1
4,Teacher,3.6,3.8,Anthropology,Peter,VGVhY2hlci0yMjIzMA==,False,Rodman,22230,73,,-1
...,...,...,...,...,...,...,...,...,...,...,...,...
4465,Teacher,4.2,2.5,Mathematics,Fu,VGVhY2hlci05NTY1OTg=,False,Liu,956598,127,,56.1404
4466,Teacher,3.8,3.6,Economics,David,VGVhY2hlci0xNDczOTYz,False,Lang,1473963,202,,50
4467,Teacher,2.6,3.5,Plant & Soil Science,Abhaya,VGVhY2hlci0xODM5MDQx,False,Dandekar,1839041,7,,66.6667
4468,Teacher,4.2,2.8,Physics,Dina,VGVhY2hlci0xOTgzMDky,False,Zhabinskaya,1983092,89,,48.3871


In [25]:
# rearrange columns
profs = profs[['__typename', 'firstName', 'lastName', 'department', 'id', 'legacyId', 'numRatings', 'avgRating', 'avgDifficulty']]
profs

Unnamed: 0,__typename,firstName,lastName,department,id,legacyId,numRatings,avgRating,avgDifficulty
0,Teacher,Robert,Borgen,Languages,VGVhY2hlci05NTY2,9566,39,3.3,2.9
1,Teacher,Aram,Yengoyan,Anthropology,VGVhY2hlci05NTY4,9568,47,2.6,2.7
2,Teacher,Patrick,Carroll-Burke,Social Science,VGVhY2hlci05NTcw,9570,13,3.2,3.1
3,Teacher,Henry,McHenry,Anthropology,VGVhY2hlci0xMzQ4MQ==,13481,124,4.5,2.5
4,Teacher,Peter,Rodman,Anthropology,VGVhY2hlci0yMjIzMA==,22230,73,3.8,3.6
...,...,...,...,...,...,...,...,...,...
4465,Teacher,Fu,Liu,Mathematics,VGVhY2hlci05NTY1OTg=,956598,127,2.5,4.2
4466,Teacher,David,Lang,Economics,VGVhY2hlci0xNDczOTYz,1473963,202,3.6,3.8
4467,Teacher,Abhaya,Dandekar,Plant & Soil Science,VGVhY2hlci0xODM5MDQx,1839041,7,3.5,2.6
4468,Teacher,Dina,Zhabinskaya,Physics,VGVhY2hlci0xOTgzMDky,1983092,89,2.8,4.2


In [26]:
profs.to_csv("profs.csv")