# Analysis of an international research paper mill

The purpose of the code below is to show that several papers list false authors whose names were added in exchange for money. The code uses a website to list all the papers that we know of, as of July 14, 2021, that have been published or are going to be published. We believe this is one of many websites that allow people to get their names on papers, so although this analysis captures some of the papers with false author names, it will not capture all of them.



## Workspace set-up


In [1]:
#from serpapi import GoogleSearch
import pandas as pd
from textblob import TextBlob
import codecs
import requests
import time
from bs4 import BeautifulSoup
import urllib.request 
from pprint import pprint




---




# Data retrieval

This section retrieves the contracts from the research paper mill, using the first contract as the starting link: http://123mi.ru/1/contract.php?r=1&n=1&m=1. The code then steps through the contract numbers and, for each unique paper, through the author positions. For each contract it parses the title and the cost of the author position (in rubles) and saves them to a data frame, along with the link to the respective contract.

This code is optional if you have already saved the resulting data frame.

In [None]:
def fetch_data(contract_number, author_position):
    """Fetch one archived contract page and return it as parsed HTML.

    Args:
        contract_number: Value substituted for the ``n`` query parameter.
        author_position: Value substituted for the ``m`` query parameter.

    Returns:
        A ``BeautifulSoup`` tree built with the ``html.parser`` backend.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            fails.
    """
    URL = f'https://web.archive.org/web/20210617233318/http://123mi.ru/1/contract.php?r=1&n={contract_number}&m={author_position}'
    # Release the HTTP response as soon as the document has been parsed;
    # previously every request left its connection open.
    with urllib.request.urlopen(URL) as html:
        return BeautifulSoup(html, 'html.parser')

def extract_contract_number(parsed_html):
    """Return the text before the first '.' of the <font> in the first <h1>.

    Gives ``None`` when the page has no <h1>, or when that <h1> contains no
    <font> element.
    """
    heading = parsed_html.find("h1")
    if not heading:
        return None

    label = heading.find("font")
    if not label:
        return None

    # Everything up to (not including) the first dot.
    number, _, _ = label.get_text().partition('.')
    return number

def extract_titles_and_scopus(parsed_html):
    """Extract the first title, the suspected second title, and Scopus data.

    Reads the second <p> of the page. The full title is the text between the
    first '«' and the next '»'. If it contains a line break, the part before
    the first break is the first (Russian) title and the remainder is the
    suspected second (English) title.

    Args:
        parsed_html: Parsed page exposing ``find_all("p")`` whose items
            expose ``get_text()``.

    Returns:
        Tuple ``(russian_title, english_title, scopus)``. ``english_title`` is
        ``'no suspected english title'`` when the title has no line break.
        ``scopus`` is the text after the second '(' of the paragraph up to the
        next ')', or the whole paragraph text when the paragraph has fewer
        than two '('.

    Raises:
        IndexError: If the page has fewer than two <p> elements or the
            paragraph contains no '«'.
    """
    scopus_russian_title = parsed_html.find_all("p")[1].get_text()
    russian_title_all = scopus_russian_title.split('«')[1].split('»')[0]

    # Extracting titles
    if '\n' in russian_title_all:
        # maxsplit=1: a title with more than one line break previously raised
        # ValueError when unpacked into exactly two names.
        russian_title, english_title = russian_title_all.split('\n', 1)
    else:
        russian_title = russian_title_all
        english_title = 'no suspected english title'

    # Extracting Scopus data; only a missing second '(' is an expected failure.
    try:
        scopus = scopus_russian_title.split('(')[2].split(')')[0]
    except IndexError:
        scopus = scopus_russian_title

    return russian_title, english_title, scopus

def extract_web_of_science(parsed_html):
    """Extract Web of Science data from the second <p> of the page.

    Args:
        parsed_html: Parsed page exposing ``find_all("p")`` whose items
            expose ``get_text()``.

    Returns:
        ``False`` when the paragraph does not mention 'Web of Science'.
        Otherwise the text after the third '(' up to the next ')', or the
        whole paragraph text when it has fewer than three '('.

    Raises:
        IndexError: If the page has fewer than two <p> elements.
    """
    text = parsed_html.find_all("p")[1].get_text()
    if 'Web of Science' not in text:
        return False

    # Only a missing third '(' is an expected failure; anything else should
    # surface instead of being silently swallowed.
    try:
        return text.split('(')[3].split(')')[0]
    except IndexError:
        return text

def extract_price(parsed_html):
    """Return the price text from the seventh <p> of the page.

    The price is whatever sits between the fixed contract phrase and the
    following ' (' in that paragraph.
    """
    marker = 'Общая стоимость услуг, выполняемых Исполнителем в рамках настоящего Договора, составляет '
    clause = parsed_html.find_all("p")[6].get_text()
    after_marker = clause.split(marker)[1]
    # Keep only what precedes the first ' ('.
    price, _, _ = after_marker.partition(' (')
    return price

def main(contract_numbers=range(1, 1009), author_positions=range(1, 8),
         output_path='raw_df.xlsx'):
    """Scrape every contract page and save the parsed fields to Excel.

    For each contract number and author position the page is fetched and
    parsed, and one row is recorded. All rows are written to ``output_path``.

    Args:
        contract_numbers: Iterable of contract numbers to request.
        author_positions: Iterable of author positions to request for each
            contract.
        output_path: Destination of the Excel file.

    Returns:
        The data frame that was written, one row per fetched page.

    Raises:
        Any exception raised by the fetch or extract helpers propagates
        unchanged; nothing is written in that case.
    """
    rows = []

    for contract_number in contract_numbers:
        for author_position in author_positions:
            parsed_html = fetch_data(contract_number, author_position)

            contract_num = extract_contract_number(parsed_html)
            russian_title, english_title, scopus = extract_titles_and_scopus(parsed_html)
            web_of_science = extract_web_of_science(parsed_html)
            price = extract_price(parsed_html)

            # The extractor returns the title already split; rebuild the
            # unsplit form (previously an undefined name was used here).
            if english_title == 'no suspected english title':
                base_title = russian_title
            else:
                base_title = f'{russian_title}\n{english_title}'

            # The request URL is private to fetch_data, so the contract link
            # is rebuilt here (previously an undefined name was used).
            # NOTE(review): this is the live-site form of the link rather than
            # the archive snapshot URL -- confirm which one is wanted.
            contract_link = (
                'http://123mi.ru/1/contract.php'
                f'?r=1&n={contract_number}&m={author_position}'
            )

            rows.append({
                'contract_link': contract_link,
                'Contract_number': contract_num,
                'Base_title': base_title,
                'First_title': russian_title,
                'Suspected_second_title': english_title,
                'Price_Ruble': price,
                # NOTE(review): same value as 'Contract_number'; confirm
                # whether the author position was meant to be included.
                'contract_number': contract_num,
                'Scopus': scopus,
                'Web_of_science': web_of_science,
            })

    # Build the frame once; concatenating inside the loop re-copied every
    # previous row on each iteration.
    df_paper = pd.DataFrame(rows)
    df_paper.to_excel(output_path)
    return df_paper


if __name__ == '__main__':
    # Keep the result at module level: the cleaning code that follows reads
    # ``df_paper``.
    df_paper = main()


In [None]:
# Backup raw data
df_paper.to_excel('raw_df.xlsx')

# Remove punctuation from specified columns. The pattern is a raw string so
# that \w and \s reach the regex engine instead of being treated as (invalid)
# Python string escapes.
columns_to_clean = ["Base_title", "First_title", "Suspected_second_title"]
for col in columns_to_clean:
    df_paper[col] = df_paper[col].str.replace(r'[^\w\s]', '', regex=True)

# Filter out entries where 'Base_title' is empty. Each filter takes a copy so
# the column assignments below never write into a slice of the old frame.
df_paper = df_paper[df_paper['Base_title'].str.strip() != ""].copy()

# Remove contracts with 'First_title' length of 20 characters or fewer
df_paper['title_length'] = df_paper['First_title'].str.len()
df_paper = df_paper[df_paper['title_length'] > 20].copy()

# Convert 'Price_Ruble' to float before filtering: comparing against the
# string '0' only matches freshly scraped (string) prices and lets zero-price
# rows through when the frame was reloaded from Excel with numeric prices.
df_paper['Price_Ruble'] = df_paper['Price_Ruble'].astype('float')

# Remove contracts with a price of 0
df_paper = df_paper[df_paper['Price_Ruble'] != 0].copy()

# Calculate the USD value (based on a given exchange rate)
exchange_rate = 0.014  # As of 8/5/2021
df_paper['USD'] = df_paper['Price_Ruble'] * exchange_rate

# Create a separate dataframe for titles, eliminating duplicates
df_titles = df_paper.drop_duplicates(subset=['Base_title'])
df_titles = df_titles[['Contract_number', 'Base_title']]

# Reset index for df_paper
df_paper.reset_index(drop=True, inplace=True)

# Backup of df_paper for potential future use
df_paper_original = df_paper.copy()


# Website Direct Scrape

This section scrapes the website to find all the papers that are still being sold.

In [15]:
df_paper

Unnamed: 0.1,Unnamed: 0,contract_link,Contract_number,Base_title,First_title,Suspected_second_title,Price_Ruble,contract_number,Scopus,Web_of_science
0,0,http://123mi.ru/1/contract.php?r=1&n=1&m=1,1,Прогноз дорожной ситуации на базе статистическ...,Прогноз дорожной ситуации на базе статистическ...,no suspected english title,57400,1.1,Q2,False
1,1,http://123mi.ru/1/contract.php?r=1&n=1&m=2,1,Прогноз дорожной ситуации на базе статистическ...,Прогноз дорожной ситуации на базе статистическ...,no suspected english title,53300,1.2,Q2,False
2,2,http://123mi.ru/1/contract.php?r=1&n=1&m=3,1,Прогноз дорожной ситуации на базе статистическ...,Прогноз дорожной ситуации на базе статистическ...,no suspected english title,49200,1.3,Q2,False
3,3,http://123mi.ru/1/contract.php?r=1&n=1&m=4,1,Прогноз дорожной ситуации на базе статистическ...,Прогноз дорожной ситуации на базе статистическ...,no suspected english title,45100,1.4,Q2,False
4,4,http://123mi.ru/1/contract.php?r=1&n=1&m=5,1,Прогноз дорожной ситуации на базе статистическ...,Прогноз дорожной ситуации на базе статистическ...,no suspected english title,41000,1.5,Q2,False
...,...,...,...,...,...,...,...,...,...,...
7217,7217,http://123mi.ru/1/contract.php?r=1&n=1998&m=3,1998,Влияние возобновляемых источников энергии и эк...,Влияние возобновляемых источников энергии и эк...,Impact of renewables and economic complexity o...,95940,1998.3,Q2,False
7218,7218,http://123mi.ru/1/contract.php?r=1&n=1998&m=4,1998,Влияние возобновляемых источников энергии и эк...,Влияние возобновляемых источников энергии и эк...,Impact of renewables and economic complexity o...,82000,1998.4,Q2,False
7219,7219,http://123mi.ru/1/contract.php?r=1&n=1999&m=1,1999,"Влияние “3-E” детерминант (экономических, энер...","Влияние “3-E” детерминант (экономических, энер...","Influence of 3-E determinants (economic, energ...",229600,1999.1,Q2,False
7220,7220,http://123mi.ru/1/contract.php?r=1&n=1999&m=2,1999,"Влияние “3-E” детерминант (экономических, энер...","Влияние “3-E” детерминант (экономических, энер...","Influence of 3-E determinants (economic, energ...",123000,1999.2,Q2,False
