In [131]:
import json
import pandas as pd
import requests
import re

In [132]:
# Global variables
school_number = "675"

In [133]:
def make_initial_request(school_number):
    url = f"https://www.ratemyprofessors.com/search/professors/{school_number}?q=*"

    # Make a get request to the URL and store the response as string
    response = requests.get(url).text

    # From the response, parse the schoolID and responseCount

    # Regular expression pattern to match the schoolID
    school_id_pattern = r'"schoolID\\":\\"(.*?)\\",'

    # Regular expression pattern to match the resultCount
    result_count_pattern = r'"resultCount":(\d+),'
    
    # Regular expression pattern to match the school name
    school_name_pattern = r'"__typename":"School","name":"(.*?)"'

    # Search for the patterns in the string
    school_id_match = re.search(school_id_pattern, response)
    result_count_match = re.search(result_count_pattern, response)
    school_name_match = re.search(school_name_pattern, response)

    if school_id_match:
        school_id = school_id_match.group(1)
        print(f"School ID: {school_id}")
    else:
        print("No school ID found.")

    if result_count_match:
        result_count = result_count_match.group(1)
        print(f"Result Count: {result_count}")
    else:
        print("No result count found.")
    
    if school_name_match:
        school_name = school_name_match.group(1)
        print(f"School Name: {school_name}")
    else:
        print("No school name found.")
    
    return {
        "school_id": school_id,
        "result_count": result_count,
        "school_name": school_name
    }

In [134]:
def get_teacher_data(school_number, school_id, result_count, chunk_size=1000):
    url = "https://www.ratemyprofessors.com/graphql"
    headers = {
        "Accept": "*/*",
        "Accept-Language": "en-US,en;q=0.9",
        "Authorization": "Basic dGVzdDp0ZXN0",
        "Connection": "keep-alive",
        "Content-Type": "application/json",
        "Cookie": "ccpa-notice-viewed-02=true",
        "Origin": "https://www.ratemyprofessors.com",
        "Referer": f"https://www.ratemyprofessors.com/search/professors/{school_number}?q=*",
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
        "sec-ch-ua": "\"Google Chrome\";v=\"125\", \"Chromium\";v=\"125\", \"Not.A/Brand\";v=\"24\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"macOS\""
    }

    all_results = []
    has_next_page = True
    cursor = ""

    while has_next_page and len(all_results) < result_count:
        body = {
            "query": """
            query TeacherSearchPaginationQuery(
              $count: Int!
              $cursor: String
              $query: TeacherSearchQuery!
            ) {
              search: newSearch {
                ...TeacherSearchPagination_search_1jWD3d
              }
            }

            fragment TeacherSearchPagination_search_1jWD3d on newSearch {
              teachers(query: $query, first: $count, after: $cursor) {
                didFallback
                edges {
                  node {
                    ...TeacherCard_teacher
                    id
                    __typename
                  }
                }
                pageInfo {
                  hasNextPage
                  endCursor
                }
                resultCount
                filters {
                  field
                  options {
                    value
                    id
                  }
                }
              }
            }

            fragment TeacherCard_teacher on Teacher {
              id
              legacyId
              avgRating
              numRatings
              ...CardFeedback_teacher
              ...CardSchool_teacher
              ...CardName_teacher
              ...TeacherBookmark_teacher
            }

            fragment CardFeedback_teacher on Teacher {
              wouldTakeAgainPercent
              avgDifficulty
            }

            fragment CardSchool_teacher on Teacher {
              department
              school {
                name
                id
              }
            }

            fragment CardName_teacher on Teacher {
              firstName
              lastName
            }

            fragment TeacherBookmark_teacher on Teacher {
              id
              isSaved
            }
            """,
            "variables": {
                "count": min(chunk_size, result_count - len(all_results)),
                "cursor": cursor,
                "query": {
                    "text": "",
                    "schoolID": f"{school_id}",
                    "fallback": True,
                    "departmentID": None
                }
            }
        }

        response = requests.post(url, headers=headers, json=body)
        data = response.json()

        if 'data' in data and 'search' in data['data'] and 'teachers' in data['data']['search']:
            teachers_data = data['data']['search']['teachers']
            if teachers_data['edges']:
                all_results.extend([edge['node'] for edge in teachers_data['edges']])
                has_next_page = teachers_data['pageInfo']['hasNextPage']
                cursor = teachers_data['pageInfo']['endCursor']
            else:
                has_next_page = False
        else:
            has_next_page = False
            print("Error in response:", data)

        # Print status to monitor progress
        print(f"Fetched {len(all_results)} teachers so far...")

    return all_results[:result_count]


In [135]:
data = make_initial_request(school_number)

school_id, result_count, school_name = data

School ID: U2Nob29sLTY3NQ==
Result Count: 5833
School Name: New York University


In [109]:
# Make another request with the result count
data = get_teacher_data(school_number, school_id, result_count)

data

Fetched 1000 teachers so far...
Fetched 2000 teachers so far...
Fetched 3000 teachers so far...
Fetched 4000 teachers so far...
Fetched 5000 teachers so far...
Fetched 5833 teachers so far...


[{'__typename': 'Teacher',
  'avgDifficulty': 0,
  'avgRating': 0,
  'department': 'Social Work',
  'firstName': 'Alma',
  'id': 'VGVhY2hlci0xOTQ1MDQ5',
  'isSaved': False,
  'lastName': 'Carten',
  'legacyId': 1945049,
  'numRatings': 0,
  'school': {'id': 'U2Nob29sLTY3NQ==', 'name': 'New York University'},
  'wouldTakeAgainPercent': -1},
 {'__typename': 'Teacher',
  'avgDifficulty': 0,
  'avgRating': 0,
  'department': 'Music',
  'firstName': 'Fred',
  'id': 'VGVhY2hlci0xOTQ4NDk1',
  'isSaved': False,
  'lastName': 'Carl',
  'legacyId': 1948495,
  'numRatings': 0,
  'school': {'id': 'U2Nob29sLTY3NQ==', 'name': 'New York University'},
  'wouldTakeAgainPercent': -1},
 {'__typename': 'Teacher',
  'avgDifficulty': 0,
  'avgRating': 0,
  'department': 'Communication',
  'firstName': 'Daniel',
  'id': 'VGVhY2hlci0xOTU0MzE2',
  'isSaved': False,
  'lastName': 'Wiley',
  'legacyId': 1954316,
  'numRatings': 0,
  'school': {'id': 'U2Nob29sLTY3NQ==', 'name': 'New York University'},
  'wouldTak

In [121]:
# Extract the required variables and create a DataFrame
parsed_data = []

for entry in data:
    full_name = f"{entry['firstName']} {entry['lastName']}"
    rating = entry['avgRating']
    department = entry['department']
    difficulty = entry['avgDifficulty']
    num_ratings = entry['numRatings']
    
    parsed_data.append({
        'Name': full_name,
        'Average Rating': rating,
        'Department': department,
        'Difficulty Rating': difficulty,
        'Number of Ratings': num_ratings
    })

df = pd.DataFrame(parsed_data)
df

Unnamed: 0,Name,Average Rating,Department,Difficulty Rating,Number of Ratings
0,Alma Carten,0.0,Social Work,0.0,0
1,Fred Carl,0.0,Music,0.0,0
2,Daniel Wiley,0.0,Communication,0.0,0
3,Russell Isaacson,0.0,Marketing,0.0,0
4,Tom Jennings,0.0,Film Television,0.0,0
...,...,...,...,...,...
5828,Elisa DiCaprio,5.0,Social Science,1.0,0
5829,Leslie Ferraro,0.0,Marketing,0.0,0
5830,Linwood Lewis,0.0,Social Work,0.0,0
5831,Seth Watter,4.0,Communication,4.0,1


In [122]:
# Keep only rows with unique (name, department) pairs
df = df.drop_duplicates(subset=['Name', 'Department'], keep='first')

df

Unnamed: 0,Name,Average Rating,Department,Difficulty Rating,Number of Ratings
0,Alma Carten,0.0,Social Work,0.0,0
1,Fred Carl,0.0,Music,0.0,0
2,Daniel Wiley,0.0,Communication,0.0,0
3,Russell Isaacson,0.0,Marketing,0.0,0
4,Tom Jennings,0.0,Film Television,0.0,0
...,...,...,...,...,...
5828,Elisa DiCaprio,5.0,Social Science,1.0,0
5829,Leslie Ferraro,0.0,Marketing,0.0,0
5830,Linwood Lewis,0.0,Social Work,0.0,0
5831,Seth Watter,4.0,Communication,4.0,1


In [123]:
# Remove rows with Number of Ratings of 0
df = df[df['Number of Ratings'] != 0]

df

Unnamed: 0,Name,Average Rating,Department,Difficulty Rating,Number of Ratings
18,Jose Vazquez,4.7,Not Specified,1.5,23
19,Greg D'Amico,3.5,Communication,2.0,7
20,Ned Wilson,3.4,Liberal Studies,2.1,11
21,Tom Meyvis,4.3,Marketing,2.6,7
22,Catherine Fitterman-Radbill,1.8,Music,2.8,18
...,...,...,...,...,...
5822,Lisa Del Rosso,4.4,Liberal Arts & Sciences,3.4,48
5823,Lorenzo Castellano,5.0,Anthropology,2.0,1
5824,Nikolay Kukushkin,4.3,Liberal Studies,3.6,63
5826,Anasse Bari,4.3,Computer Science,3.6,209


In [124]:
# Get mean rating for each department
mean_ratings = df.groupby('Department')['Average Rating'].mean().reset_index()
mean_ratings = mean_ratings.sort_values(by='Average Rating', ascending=False)

mean_ratings

Unnamed: 0,Department,Average Rating
115,Occupational Therapy,5.000000
126,Publishing & Printing,5.000000
65,Genetics,5.000000
107,Natural Sciences,5.000000
20,Arts amp Sciences,5.000000
...,...,...
5,American Studies,2.800000
122,Professional Programs,2.690909
27,Business Law,2.500000
143,Teaching Learning,2.000000


In [125]:
# Get mean difficulty rating for each department
mean_difficulty = df.groupby('Department')['Difficulty Rating'].mean().reset_index()
mean_difficulty = mean_difficulty.sort_values(by='Difficulty Rating', ascending=False)

mean_difficulty

Unnamed: 0,Department,Difficulty Rating
143,Teaching Learning,4.500000
9,Architecture,4.500000
60,Foundations of Am. Culture/Classics,4.300000
81,Interactive Telecommunications,4.250000
22,Biochemistry,4.200000
...,...,...
1,Administration,2.000000
115,Occupational Therapy,2.000000
3,African Studies,1.916667
145,Teaching amp Learning,1.650000


In [126]:
# Get number of ratings for each department
num_ratings = df.groupby('Department')['Number of Ratings'].sum().reset_index()
num_ratings = num_ratings.sort_values(by='Number of Ratings', ascending=False)

num_ratings

Unnamed: 0,Department,Number of Ratings
153,Writing,5096
100,Mathematics,3476
42,Economics,3058
33,Computer Science,2909
123,Psychology,2490
...,...,...
1,Administration,2
20,Arts amp Sciences,2
126,Publishing & Printing,2
65,Genetics,1


In [127]:
# Create a matrix of the mean ratings, mean difficulty ratings, and number of ratings for each department
department_stats = mean_ratings.merge(mean_difficulty, on='Department')
department_stats = department_stats.merge(num_ratings, on='Department')

# Replace ' amp ' with ' & ' in the Department column
department_stats['Department'] = department_stats['Department'].str.replace(' amp ', ' & ')

department_stats

Unnamed: 0,Department,Average Rating,Difficulty Rating,Number of Ratings
0,Occupational Therapy,5.000000,2.000000,1
1,Publishing & Printing,5.000000,3.500000,2
2,Genetics,5.000000,1.000000,1
3,Natural Sciences,5.000000,3.100000,99
4,Arts & Sciences,5.000000,2.500000,2
...,...,...,...,...
149,American Studies,2.800000,3.550000,23
150,Professional Programs,2.690909,3.227273,34
151,Business Law,2.500000,2.700000,7
152,Teaching Learning,2.000000,4.500000,6


In [128]:
# Get departments with 50 or more ratings
department_stats = department_stats[department_stats['Number of Ratings'] >= 50]

department_stats

Unnamed: 0,Department,Average Rating,Difficulty Rating,Number of Ratings
3,Natural Sciences,5.000000,3.100000,99
5,Paleontology,5.000000,3.600000,585
12,Independent Studies,4.800000,3.450000,109
17,Foreign Languages,4.636364,2.872727,82
18,German,4.547368,2.905263,72
...,...,...,...,...
134,Hospitality,3.348148,2.970370,166
136,Nursing,3.291304,3.041304,291
138,Expository Writing,3.180000,3.200000,50
142,International Relations,3.058333,3.333333,82


In [130]:
# Save the df as a json file (nyu-prof-data.json)

df.to_json('nyu-prof-data.json', orient='records')