# Data Collection: 

# Webscraping basic RateMyProfessors (RMP) info for all professors at UC Davis

In [1]:
# import modules
import pandas as pd
import requests
import requests_cache
import json
import time
from tqdm import tqdm, trange

In [2]:
# set up cache
requests_cache.install_cache("profs")

## Request data from RMP

In [3]:
# endpoint and header for POST request
url = 'https://www.ratemyprofessors.com/graphql'
headers = {
    "Authorization": "Basic dGVzdDp0ZXN0", 
}

### Initial request (first 8 professors)

In [4]:

# data provided for GraphQL
data = {
    "query":"query TeacherSearchPaginationQuery(\n  $count: Int!\n  $cursor: String\n  $query: TeacherSearchQuery!\n) {\n  search: newSearch {\n    ...TeacherSearchPagination_search_1jWD3d\n  }\n}\n\nfragment TeacherSearchPagination_search_1jWD3d on newSearch {\n  teachers(query: $query, first: $count, after: $cursor) {\n    didFallback\n    edges {\n      cursor\n      node {\n        ...TeacherCard_teacher\n        id\n        __typename\n      }\n    }\n    pageInfo {\n      hasNextPage\n      endCursor\n    }\n    resultCount\n    filters {\n      field\n      options {\n        value\n        id\n      }\n    }\n  }\n}\n\nfragment TeacherCard_teacher on Teacher {\n  id\n  legacyId\n  avgRating\n  numRatings\n  ...CardFeedback_teacher\n  ...CardSchool_teacher\n  ...CardName_teacher\n  ...TeacherBookmark_teacher\n}\n\nfragment CardFeedback_teacher on Teacher {\n  wouldTakeAgainPercent\n  avgDifficulty\n}\n\nfragment CardSchool_teacher on Teacher {\n  department\n  school {\n    name\n    id\n  }\n}\n\nfragment CardName_teacher on Teacher {\n  firstName\n  lastName\n}\n\nfragment TeacherBookmark_teacher on Teacher {\n  id\n  isSaved\n}\n",
    "variables":{
        "count":10000, # number of professors
        #"cursor":cursor,
        "query":{
            "text":"",
            "schoolID":"U2Nob29sLTEwNzM=",
            "fallback":True,
            "departmentID":None
        }
    }
} 

In [5]:
# response/check status
response = requests.post(url, headers = headers, json=data)
response.raise_for_status()

In [6]:
# put response into dataframe
cursorDF = pd.DataFrame(response.json()['data']['search']['teachers']['edges'])
cursorDF

Unnamed: 0,cursor,node
0,YXJyYXljb25uZWN0aW9uOjA=,"{'__typename': 'Teacher', 'avgDifficulty': 2.9..."
1,YXJyYXljb25uZWN0aW9uOjE=,"{'__typename': 'Teacher', 'avgDifficulty': 2.7..."
2,YXJyYXljb25uZWN0aW9uOjI=,"{'__typename': 'Teacher', 'avgDifficulty': 3.1..."
3,YXJyYXljb25uZWN0aW9uOjM=,"{'__typename': 'Teacher', 'avgDifficulty': 2.5..."
4,YXJyYXljb25uZWN0aW9uOjQ=,"{'__typename': 'Teacher', 'avgDifficulty': 3.6..."
...,...,...
4466,YXJyYXljb25uZWN0aW9uOjQ0NjY=,"{'__typename': 'Teacher', 'avgDifficulty': 4.3..."
4467,YXJyYXljb25uZWN0aW9uOjQ0Njc=,"{'__typename': 'Teacher', 'avgDifficulty': 4.7..."
4468,YXJyYXljb25uZWN0aW9uOjQ0Njg=,"{'__typename': 'Teacher', 'avgDifficulty': 3.7..."
4469,YXJyYXljb25uZWN0aW9uOjQ0Njk=,"{'__typename': 'Teacher', 'avgDifficulty': 4, ..."


In [7]:
# initialize dataframe
profs = pd.DataFrame(columns=cursorDF['node'][0].keys())
profs

Unnamed: 0,__typename,avgDifficulty,avgRating,department,firstName,id,isSaved,lastName,legacyId,numRatings,school,wouldTakeAgainPercent


In [8]:
# loop to populate profs df
for i in trange(0, len(cursorDF)):
    
    # temporary dictionary
    tempDict = cursorDF['node'][i]
    
    # delete problematic/unnecessary dictionary within dictionary
    del tempDict['school']

    # convert dict to temporary dataframe
    tempDF = pd.DataFrame(tempDict, index=[i])

    # append tempDF to profs df
    profs.loc[len(profs)] = tempDF.iloc[0,:]

  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc[len(profs)] = tempDF.iloc[0,:]
  profs.loc

In [9]:
# view profs df
profs

Unnamed: 0,__typename,avgDifficulty,avgRating,department,firstName,id,isSaved,lastName,legacyId,numRatings,school,wouldTakeAgainPercent
0,Teacher,2.9,3.3,Languages,Robert,VGVhY2hlci05NTY2,False,Borgen,9566,39,,-1
1,Teacher,2.7,2.6,Anthropology,Aram,VGVhY2hlci05NTY4,False,Yengoyan,9568,47,,-1
2,Teacher,3.1,3.2,Social Science,Patrick,VGVhY2hlci05NTcw,False,Carroll-Burke,9570,13,,-1
3,Teacher,2.5,4.5,Anthropology,Henry,VGVhY2hlci0xMzQ4MQ==,False,McHenry,13481,124,,-1
4,Teacher,3.6,3.8,Anthropology,Peter,VGVhY2hlci0yMjIzMA==,False,Rodman,22230,73,,-1
...,...,...,...,...,...,...,...,...,...,...,...,...
4466,Teacher,4.3,2.3,Computer Science,Chris,VGVhY2hlci00NjU5MTM=,False,Nitta,465913,189,,21.3115
4467,Teacher,4.7,2,Computer Science,Daryl,VGVhY2hlci0yNTA5NTcy,False,Posnett,2509572,57,,27.451
4468,Teacher,3.7,1.6,Science,Daniel,VGVhY2hlci0xMjUyNjg=,False,Ferenc,125268,29,,9.0909
4469,Teacher,4,3.7,Biology,Kenneth,VGVhY2hlci0xNTE4OTQ=,False,Hilt,151894,325,,56.1798


In [10]:
# tidy columns
profs = profs[['__typename', 'firstName', 'lastName', 'department', 'id', 'legacyId', 'numRatings', 'avgRating', 'avgDifficulty']]
profs

Unnamed: 0,__typename,firstName,lastName,department,id,legacyId,numRatings,avgRating,avgDifficulty
0,Teacher,Robert,Borgen,Languages,VGVhY2hlci05NTY2,9566,39,3.3,2.9
1,Teacher,Aram,Yengoyan,Anthropology,VGVhY2hlci05NTY4,9568,47,2.6,2.7
2,Teacher,Patrick,Carroll-Burke,Social Science,VGVhY2hlci05NTcw,9570,13,3.2,3.1
3,Teacher,Henry,McHenry,Anthropology,VGVhY2hlci0xMzQ4MQ==,13481,124,4.5,2.5
4,Teacher,Peter,Rodman,Anthropology,VGVhY2hlci0yMjIzMA==,22230,73,3.8,3.6
...,...,...,...,...,...,...,...,...,...
4466,Teacher,Chris,Nitta,Computer Science,VGVhY2hlci00NjU5MTM=,465913,189,2.3,4.3
4467,Teacher,Daryl,Posnett,Computer Science,VGVhY2hlci0yNTA5NTcy,2509572,57,2,4.7
4468,Teacher,Daniel,Ferenc,Science,VGVhY2hlci0xMjUyNjg=,125268,29,1.6,3.7
4469,Teacher,Kenneth,Hilt,Biology,VGVhY2hlci0xNTE4OTQ=,151894,325,3.7,4


In [11]:
# these should be removed. professors with no ratings
profs[profs['avgRating'] < 1]

Unnamed: 0,__typename,firstName,lastName,department,id,legacyId,numRatings,avgRating,avgDifficulty
141,Teacher,Paul,Baumann,Biology,VGVhY2hlci0xNTkyODg=,159288,0,0,0
199,Teacher,Te,Williams,Agriculture,VGVhY2hlci0yMTQ2NzI=,214672,0,0,0
224,Teacher,Scott,Schonfeldt-Aultman,Ethnic Studies,VGVhY2hlci0yMjg4NDQ=,228844,0,0,0
237,Teacher,Craig,McDonald,Medicine,VGVhY2hlci0yNDczNDU=,247345,0,0,0
255,Teacher,Sharlene,Gilman,English,VGVhY2hlci0yNTg2ODc=,258687,0,0,0
...,...,...,...,...,...,...,...,...,...
4385,Teacher,George,Hegarty,University Writing Program,VGVhY2hlci0yODQxMzM3,2841337,0,0,0
4386,Teacher,Sen-Ching,Cheung,Computer Science,VGVhY2hlci0yODQxNDIw,2841420,0,0,0
4396,Teacher,Alessandro,Ossola,Environmental Science,VGVhY2hlci0yODQ0MDk3,2844097,0,0,0
4410,Teacher,Graham,McDougal,Art,VGVhY2hlci0yNjc2MzU4,2676358,0,0,0


In [12]:
# drop rows with 0 ratings
idxRemove = profs[profs['avgRating'] < 1].index
profs = profs.drop(idxRemove)

In [13]:
# save profs df to csv
profs.to_csv("profs.csv")