# Analysis of an international research paper mill

The purpose of the code below is to show that several papers list authors who paid to have their names added. The code uses one website to list every paper we know of, as of July 14, 2021, that has been or is going to be published through it. We believe this is only one of many websites that let people buy their names onto papers, so this analysis captures some, but not all, of the papers with false author names.



## Workspace set-up


In [1]:
from serpapi import GoogleSearch
import pandas as pd
from textblob import TextBlob
import codecs
import requests
import time
from bs4 import BeautifulSoup
import urllib.request 
from pprint import pprint




---




# Data retrieval

This section retrieves the contracts from the research paper mill, starting from the first contract: http://123mi.ru/1/contract.php?r=1&n=1&m=1. The code then steps through every contract number (`n` in the link) and, for each paper, every author position (`m` in the link). For each contract it parses the paper title and the price of the author position (in rubles) and saves them to a data frame, together with the link to the respective contract.

This code is optional if you have already saved the resulting data frame.

In [2]:
# Retrieve every contract page from the paper mill and build df_paper with one
# row per (contract, author position).

# Text that precedes the price in the seventh paragraph of a contract page.
PRICE_PREFIX = 'Общая стоимость услуг, выполняемых Исполнителем в рамках настоящего Договора, составляет '
# The scan moves on to the next contract when a page has exactly this many <p>
# tags. NOTE(review): presumably such a page is an unfilled contract template
# for an author position that does not exist - confirm against the site.
EMPTY_PAGE_PARAGRAPHS = 7
# Column order of the resulting data frame.
CONTRACT_COLUMNS = [
    'contract_link', 'Base_title', 'First_title', 'Suspected_second_title',
    'Price_Ruble', 'contract_number', 'Scopus', 'Web_of_science',
]


def parse_contract_paragraphs(paragraphs, url, label):
    """Build one data-frame row from the <p> tags of a contract page.

    Parameters
    ----------
    paragraphs : list
        The page's <p> tags in document order. The second one holds the
        indexing information and the title(s) inside « »; the seventh one
        holds the price.
    url : str
        Link to the contract page, stored in the row as-is.
    label : str
        Contract label shown in the page heading (e.g. "12.3").

    Returns
    -------
    dict or None
        The row, or None when the page does not have the expected layout
        (too few paragraphs, no « in the second paragraph, or no price
        phrase in the seventh), so a caller can skip it instead of recording
        values left over from another page.
    """
    if len(paragraphs) < EMPTY_PAGE_PARAGRAPHS:
        return None
    header_text = paragraphs[1].get_text()
    # get_text() already drops the <font> markup, so the price paragraph only
    # needs to be split on the fixed phrase in front of the amount.
    price_parts = paragraphs[6].get_text().split(PRICE_PREFIX)
    title_parts = header_text.split('«')
    if len(title_parts) < 2 or len(price_parts) < 2:
        return None

    base_title = title_parts[1].split('»')[0]
    price = price_parts[1].split(' (')[0]

    # The Scopus label is the text inside the second pair of parentheses
    # before the opening «. When that is missing, keep the whole paragraph so
    # the information is not lost.
    try:
        scopus = title_parts[0].split('(')[2].split(')')[0]
    except IndexError:
        scopus = header_text

    # Two titles (the second one suspected to be English) are separated by a
    # newline inside the « ».
    if '\n' in base_title:
        title_lines = base_title.split('\n')
        first_title = title_lines[0]
        second_title = title_lines[1]
    else:
        first_title = base_title
        second_title = 'no suspected english title'

    return {
        'contract_link': url,
        'Base_title': base_title,
        'First_title': first_title,
        'Suspected_second_title': second_title,
        'Price_Ruble': price,
        'contract_number': label,
        'Scopus': scopus,
        'Web_of_science': 'Web of Science' in header_text,
    }


# Rows are collected in a list and turned into a data frame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call.
rows = []

# Set the number of papers / contracts to pull
for contract_number in range(1, 2000):
    # Set the number of author positions to pull
    for author_position in range(1, 8):
        # The starting link is http://123mi.ru/1/contract.php?r=1&n=1&m=1
        # A backup for the web archive is: https://web.archive.org/web/20201031161224/http://123mi.ru/1/contract.php?r=1&n=1000&m=1
        URL = 'http://123mi.ru/1/contract.php?r=1&n={}&m={}'.format(contract_number, author_position)
        # The context manager closes the connection once the page is parsed.
        with urllib.request.urlopen(URL) as response:
            htmlParse = BeautifulSoup(response, 'html.parser')

        # The heading shows the contract label; it is printed as progress
        # output. In the retrieved data it equals "<n>.<m>" from the link, so
        # that value is the fallback when a page has no heading.
        coun = '{}.{}'.format(contract_number, author_position)
        for heading in htmlParse.find_all("h1"):
            for font in heading.find_all("font"):
                coun = font.get_text()
                print(coun)

        paragraphs = htmlParse.find_all("p")
        if len(paragraphs) == EMPTY_PAGE_PARAGRAPHS:
            # No contract for this author position: go to the next contract.
            break

        row = parse_contract_paragraphs(paragraphs, URL, coun)
        if row is None:
            print('Skipping {}: unexpected page layout'.format(URL))
            continue
        rows.append(row)

df_paper = pd.DataFrame(rows, columns=CONTRACT_COLUMNS)
df_paper.to_excel('raw_df.xlsx')
# Displays the data frame at the end; this line can be removed.
df_paper


1.1
1.2
1.3
1.4
1.5
1.6
2.1
2.2
2.3
2.4
3.1
3.2
3.3
3.4
4.1
4.2
4.3
4.4
5.1
5.2
5.3
5.4
6.1
6.2
6.3
6.4
7.1
7.2
7.3
7.4
8.1
8.2
8.3
8.4
9.1
9.2
9.3
9.4
9.5
10.1
10.2
10.3
10.4
10.5
11.1
11.2
11.3
11.4
11.5
11.6
12.1
12.2
12.3
12.4
12.5
12.6
13.1
13.2
13.3
13.4
13.5
13.6
14.1
14.2
14.3
14.4
15.1
15.2
15.3
15.4
16.1
16.2
16.3
16.4
16.5
16.6
17.1
17.2
17.3
17.4
17.5
18.1
18.2
18.3
18.4
18.5
19.1
19.2
19.3
19.4
19.5
19.6
20.1
20.2
20.3
20.4
20.5
20.6
21.1
21.2
21.3
21.4
22.1
22.2
22.3
22.4
23.1
23.2
23.3
23.4
24.1
24.2
24.3
24.4
24.5
24.6
25.1
25.2
25.3
25.4
25.5
25.6
26.1
26.2
26.3
26.4
27.1
27.2
27.3
27.4
28.1
28.2
28.3
28.4
28.5
28.6
29.1
29.2
29.3
29.4
30.1
30.2
30.3
30.4
31.1
31.2
31.3
31.4
32.1
32.2
32.3
32.4
32.5
32.6
33.1
33.2
33.3
33.4
33.5
33.6
34.1
34.2
34.3
34.4
34.5
34.6
35.1
35.2
35.3
35.4
36.1
36.2
36.3
36.4
37.1
37.2
37.3
37.4
38.1
38.2
38.3
38.4
39.1
39.2
39.3
39.4
40.1
40.2
40.3
40.4
41.1
41.2
41.3
41.4
42.1
42.2
42.3
42.4
43.1
43.2
43.3
43.4
43.5
43.6
44.1
44.2
44.3
44.4

313.1
313.2
313.3
313.4
313.5
313.6
314.1
314.2
314.3
314.4
314.5
314.6
315.1
315.2
315.3
315.4
315.5
316.1
316.2
316.3
316.4
316.5
316.6
317.1
317.2
317.3
317.4
317.5
318.1
318.2
318.3
318.4
318.5
318.6
319.1
319.2
319.3
319.4
319.5
320.1
320.2
320.3
320.4
320.5
320.6
321.1
321.2
321.3
321.4
321.5
322.1
322.2
322.3
322.4
322.5
322.6
323.1
323.2
323.3
323.4
323.5
324.1
324.2
324.3
324.4
324.5
324.6
325.1
325.2
325.3
325.4
325.5
326.1
326.2
326.3
326.4
326.5
326.6
327.1
328.1
329.1
329.2
329.3
329.4
330.1
330.2
330.3
330.4
330.5
331.1
331.2
331.3
331.4
332.1
332.2
332.3
332.4
332.5
333.1
333.2
333.3
333.4
333.5
333.6
334.1
334.2
334.3
334.4
334.5
335.1
335.2
335.3
335.4
335.5
336.1
336.2
336.3
336.4
336.5
336.6
337.1
337.2
337.3
337.4
337.5
337.6
338.1
338.2
338.3
338.4
338.5
338.6
339.1
339.2
339.3
339.4
339.5
340.1
340.2
340.3
340.4
341.1
341.2
341.3
341.4
341.5
341.6
342.1
342.2
342.3
342.4
343.1
343.2
343.3
343.4
343.5
344.1
344.2
344.3
344.4
344.5
344.6
345.1
345.2
345.3
345.4
346.

631.3
631.4
632.1
632.2
632.3
632.4
633.1
633.2
633.3
633.4
633.5
634.1
634.2
634.3
634.4
634.5
635.1
635.2
635.3
635.4
636.1
636.2
636.3
636.4
637.1
637.2
637.3
637.4
637.5
638.1
638.2
638.3
638.4
639.1
639.2
639.3
639.4
639.5
639.6
640.1
640.2
640.3
640.4
640.5
641.1
641.2
641.3
641.4
642.1
642.2
642.3
642.4
642.5
642.6
643.1
643.2
643.3
643.4
643.5
644.1
644.2
644.3
644.4
645.1
645.2
645.3
645.4
645.5
645.6
646.1
646.2
646.3
646.4
646.5
647.1
647.2
647.3
647.4
648.1
648.2
648.3
648.4
648.5
648.6
649.1
649.2
649.3
649.4
649.5
650.1
650.2
650.3
650.4
651.1
651.2
651.3
651.4
651.5
651.6
652.1
652.2
652.3
652.4
652.5
653.1
653.2
653.3
653.4
654.1
654.2
654.3
654.4
654.5
654.6
655.1
655.2
655.3
655.4
655.5
656.1
656.2
656.3
656.4
657.1
657.2
657.3
657.4
658.1
658.2
658.3
658.4
659.1
659.2
659.3
659.4
659.5
660.1
660.2
660.3
660.4
661.1
661.2
661.3
661.4
662.1
663.1
663.2
663.3
663.4
663.5
664.1
664.2
664.3
664.4
665.1
665.2
665.3
665.4
666.1
666.2
666.3
666.4
667.1
667.2
667.3
667.4
667.

941.3
941.4
942.1
942.2
942.3
942.4
942.5
942.6
943.1
943.2
943.3
943.4
943.5
943.6
944.1
944.2
944.3
944.4
944.5
944.6
945.1
945.2
945.3
945.4
945.5
946.1
946.2
946.3
946.4
946.5
947.1
947.2
947.3
947.4
947.5
948.1
948.2
948.3
948.4
948.5
949.1
949.2
949.3
949.4
950.1
950.2
950.3
950.4
950.5
951.1
951.2
951.3
951.4
952.1
952.2
952.3
952.4
952.5
953.1
953.2
953.3
953.4
953.5
953.6
954.1
954.2
954.3
954.4
954.5
955.1
955.2
955.3
955.4
956.1
956.2
956.3
956.4
956.5
957.1
957.2
957.3
957.4
958.1
958.2
958.3
958.4
958.5
959.1
959.2
959.3
959.4
959.5
960.1
960.2
960.3
960.4
960.5
960.6
961.1
961.2
961.3
961.4
962.1
962.2
962.3
962.4
963.1
963.2
963.3
963.4
963.5
964.1
964.2
964.3
964.4
965.1
965.2
965.3
965.4
965.5
966.1
966.2
966.3
966.4
967.1
967.2
967.3
967.4
967.5
967.6
968.1
968.2
968.3
968.4
968.5
969.1
969.2
969.3
969.4
969.5
970.1
970.2
970.3
970.4
970.5
971.1
971.2
971.3
971.4
972.1
972.2
972.3
972.4
972.5
973.1
973.2
973.3
973.4
973.5
974.1
974.2
974.3
974.4
974.5
974.6
975.1
975.

1197.5
1198.1
1198.2
1198.3
1198.4
1199.1
1199.2
1199.3
1199.4
1199.5
1200.1
1200.2
1200.3
1200.4
1200.5
1200.6
1201.1
1201.2
1201.3
1201.4
1201.5
1202.1
1202.2
1202.3
1202.4
1202.5
1203.1
1203.2
1203.3
1203.4
1204.1
1204.2
1204.3
1204.4
1204.5
1204.6
1205.1
1205.2
1205.3
1205.4
1205.5
1206.1
1206.2
1206.3
1206.4
1206.5
1206.6
1207.1
1207.2
1207.3
1207.4
1207.5
1208.1
1208.2
1208.3
1208.4
1208.5
1208.6
1209.1
1209.2
1209.3
1209.4
1209.5
1210.1
1210.2
1210.3
1210.4
1211.1
1211.2
1211.3
1211.4
1211.5
1212.1
1212.2
1212.3
1212.4
1213.1
1213.2
1213.3
1213.4
1213.5
1214.1
1214.2
1214.3
1214.4
1214.5
1214.6
1215.1
1215.2
1215.3
1215.4
1215.5
1216.1
1216.2
1216.3
1216.4
1217.1
1217.2
1217.3
1217.4
1217.5
1218.1
1218.2
1218.3
1218.4
1218.5
1218.6
1219.1
1219.2
1219.3
1219.4
1219.5
1220.1
1220.2
1220.3
1220.4
1221.1
1221.2
1221.3
1221.4
1222.1
1222.2
1222.3
1222.4
1222.5
1223.1
1223.2
1223.3
1223.4
1223.5
1223.6
1224.1
1224.2
1224.3
1224.4
1224.5
1225.1
1225.2
1225.3
1225.4
1225.5
1226.1
1226.2

1450.2
1450.3
1450.4
1450.5
1451.1
1451.2
1451.3
1451.4
1451.5
1452.1
1452.2
1452.3
1452.4
1452.5
1453.1
1453.2
1453.3
1453.4
1453.5
1453.6
1454.1
1454.2
1454.3
1454.4
1454.5
1455.1
1455.2
1455.3
1455.4
1455.5
1456.1
1456.2
1456.3
1456.4
1457.1
1457.2
1457.3
1457.4
1457.5
1458.1
1458.2
1458.3
1458.4
1459.1
1459.2
1459.3
1459.4
1460.1
1460.2
1460.3
1460.4
1460.5
1460.6
1461.1
1461.2
1461.3
1461.4
1462.1
1462.2
1462.3
1462.4
1462.5
1463.1
1464.1
1464.2
1464.3
1464.4
1465.1
1465.2
1465.3
1465.4
1465.5
1466.1
1466.2
1466.3
1466.4
1466.5
1466.6
1467.1
1467.2
1467.3
1467.4
1468.1
1468.2
1468.3
1468.4
1468.5
1469.1
1469.2
1469.3
1469.4
1469.5
1469.6
1470.1
1470.2
1470.3
1470.4
1470.5
1470.6
1470.7
1471.1
1471.2
1471.3
1471.4
1471.5
1472.1
1472.2
1472.3
1472.4
1473.1
1473.2
1473.3
1473.4
1474.1
1474.2
1474.3
1474.4
1474.5
1475.1
1475.2
1475.3
1475.4
1475.5
1476.1
1476.2
1476.3
1476.4
1476.5
1476.6
1477.1
1477.2
1477.3
1477.4
1477.5
1478.1
1478.2
1478.3
1478.4
1479.1
1479.2
1479.3
1479.4
1479.5

1689.3
1689.4
1690.1
1690.2
1690.3
1690.4
1690.5
1691.1
1691.2
1691.3
1691.4
1692.1
1692.2
1692.3
1692.4
1693.1
1693.2
1693.3
1693.4
1694.1
1694.2
1694.3
1694.4
1694.5
1695.1
1695.2
1695.3
1695.4
1695.5
1695.6
1696.1
1696.2
1696.3
1696.4
1696.5
1697.1
1697.2
1697.3
1697.4
1697.5
1698.1
1698.2
1698.3
1698.4
1698.5
1698.6
1699.1
1699.2
1699.3
1699.4
1699.5
1700.1
1700.2
1700.3
1700.4
1700.5
1701.1
1701.2
1701.3
1701.4
1702.1
1702.2
1702.3
1702.4
1702.5
1703.1
1703.2
1703.3
1703.4
1703.5
1704.1
1704.2
1704.3
1704.4
1704.5
1705.1
1705.2
1705.3
1705.4
1705.5
1706.1
1706.2
1706.3
1706.4
1707.1
1707.2
1707.3
1707.4
1707.5
1708.1
1708.2
1708.3
1708.4
1708.5
1709.1
1709.2
1709.3
1709.4
1709.5
1709.6
1710.1
1710.2
1710.3
1710.4
1711.1
1711.2
1711.3
1711.4
1712.1
1712.2
1712.3
1712.4
1712.5
1713.1
1713.2
1713.3
1713.4
1713.5
1714.1
1714.2
1714.3
1714.4
1714.5
1714.6
1715.1
1715.2
1715.3
1715.4
1715.5
1716.1
1716.2
1716.3
1716.4
1717.1
1717.2
1717.3
1717.4
1717.5
1717.6
1718.1
1718.2
1718.3
1718.4

1932.1
1932.2
1932.3
1932.4
1933.1
1933.2
1933.3
1933.4
1934.1
1934.2
1934.3
1934.4
1935.1
1935.2
1935.3
1935.4
1936.1
1936.2
1936.3
1936.4
1937.1
1937.2
1937.3
1937.4
1938.1
1938.2
1938.3
1938.4
1938.5
1939.1
1939.2
1939.3
1939.4
1940.1
1940.2
1940.3
1940.4
1941.1
1941.2
1941.3
1941.4
1942.1
1942.2
1942.3
1942.4
1942.5
1943.1
1943.2
1943.3
1943.4
1943.5
1944.1
1944.2
1944.3
1944.4
1944.5
1945.1
1945.2
1945.3
1945.4
1945.5
1946.1
1946.2
1946.3
1946.4
1946.5
1946.6
1947.1
1947.2
1947.3
1947.4
1947.5
1948.1
1948.2
1948.3
1948.4
1948.5
1949.1
1949.2
1949.3
1949.4
1949.5
1949.6
1950.1
1950.2
1950.3
1950.4
1950.5
1951.1
1951.2
1951.3
1951.4
1952.1
1952.2
1952.3
1952.4
1952.5
1953.1
1953.2
1953.3
1953.4
1953.5
1954.1
1954.2
1954.3
1954.4
1955.1
1955.2
1955.3
1955.4
1955.5
1956.1
1956.2
1956.3
1956.4
1956.5
1957.1
1957.2
1957.3
1957.4
1957.5
1958.1
1958.2
1958.3
1958.4
1958.5
1959.1
1959.2
1959.3
1959.4
1959.5
1960.1
1960.2
1960.3
1960.4
1960.5
1961.1
1961.2
1961.3
1961.4
1962.1
1962.2
1962.3

Unnamed: 0,contract_link,Base_title,First_title,Suspected_second_title,Price_Ruble,contract_number,Scopus,Web_of_science
0,http://123mi.ru/1/contract.php?r=1&n=1&m=1,Прогноз дорожной ситуации на базе статистическ...,Прогноз дорожной ситуации на базе статистическ...,no suspected english title,57400,1.1,Q2,False
1,http://123mi.ru/1/contract.php?r=1&n=1&m=2,Прогноз дорожной ситуации на базе статистическ...,Прогноз дорожной ситуации на базе статистическ...,no suspected english title,53300,1.2,Q2,False
2,http://123mi.ru/1/contract.php?r=1&n=1&m=3,Прогноз дорожной ситуации на базе статистическ...,Прогноз дорожной ситуации на базе статистическ...,no suspected english title,49200,1.3,Q2,False
3,http://123mi.ru/1/contract.php?r=1&n=1&m=4,Прогноз дорожной ситуации на базе статистическ...,Прогноз дорожной ситуации на базе статистическ...,no suspected english title,45100,1.4,Q2,False
4,http://123mi.ru/1/contract.php?r=1&n=1&m=5,Прогноз дорожной ситуации на базе статистическ...,Прогноз дорожной ситуации на базе статистическ...,no suspected english title,41000,1.5,Q2,False
...,...,...,...,...,...,...,...,...
7124,http://123mi.ru/1/contract.php?r=1&n=1972&m=4,Оценка финансовой и нефинансовой эффективности...,Оценка финансовой и нефинансовой эффективности...,Assessment of the financial and non-financial ...,41000,1972.4,Q3Q4,False
7125,http://123mi.ru/1/contract.php?r=1&n=1973&m=1,Перспективы развития сотрудничества России и С...,Перспективы развития сотрудничества России и С...,Prospects for the development of cooperation b...,205820,1973.1,Q2,True
7126,http://123mi.ru/1/contract.php?r=1&n=1973&m=2,Перспективы развития сотрудничества России и С...,Перспективы развития сотрудничества России и С...,Prospects for the development of cooperation b...,109880,1973.2,Q2,True
7127,http://123mi.ru/1/contract.php?r=1&n=1973&m=3,Перспективы развития сотрудничества России и С...,Перспективы развития сотрудничества России и С...,Prospects for the development of cooperation b...,95940,1973.3,Q2,True


In [8]:
# Split the "<contract>.<position>" label at its first dot: the part before the
# dot becomes Contract_number and the part after it becomes Author_pos.
new = df_paper["contract_number"].str.split(pat=".", n=1, expand=True)
for column_name, part in (("Contract_number", 0), ("Author_pos", 1)):
    df_paper[column_name] = new[part]

Unnamed: 0,contract_link,Base_title,First_title,Suspected_second_title,Price_Ruble,contract_number,Scopus,Web_of_science,Contract_number,Author_pos
0,http://123mi.ru/1/contract.php?r=1&n=1&m=1,Прогноз дорожной ситуации на базе статистическ...,Прогноз дорожной ситуации на базе статистическ...,no suspected english title,57400,1.1,Q2,False,1,1
1,http://123mi.ru/1/contract.php?r=1&n=1&m=2,Прогноз дорожной ситуации на базе статистическ...,Прогноз дорожной ситуации на базе статистическ...,no suspected english title,53300,1.2,Q2,False,1,2
2,http://123mi.ru/1/contract.php?r=1&n=1&m=3,Прогноз дорожной ситуации на базе статистическ...,Прогноз дорожной ситуации на базе статистическ...,no suspected english title,49200,1.3,Q2,False,1,3
3,http://123mi.ru/1/contract.php?r=1&n=1&m=4,Прогноз дорожной ситуации на базе статистическ...,Прогноз дорожной ситуации на базе статистическ...,no suspected english title,45100,1.4,Q2,False,1,4
4,http://123mi.ru/1/contract.php?r=1&n=1&m=5,Прогноз дорожной ситуации на базе статистическ...,Прогноз дорожной ситуации на базе статистическ...,no suspected english title,41000,1.5,Q2,False,1,5
...,...,...,...,...,...,...,...,...,...,...
7124,http://123mi.ru/1/contract.php?r=1&n=1972&m=4,Оценка финансовой и нефинансовой эффективности...,Оценка финансовой и нефинансовой эффективности...,Assessment of the financial and non-financial ...,41000,1972.4,Q3Q4,False,1972,4
7125,http://123mi.ru/1/contract.php?r=1&n=1973&m=1,Перспективы развития сотрудничества России и С...,Перспективы развития сотрудничества России и С...,Prospects for the development of cooperation b...,205820,1973.1,Q2,True,1973,1
7126,http://123mi.ru/1/contract.php?r=1&n=1973&m=2,Перспективы развития сотрудничества России и С...,Перспективы развития сотрудничества России и С...,Prospects for the development of cooperation b...,109880,1973.2,Q2,True,1973,2
7127,http://123mi.ru/1/contract.php?r=1&n=1973&m=3,Перспективы развития сотрудничества России и С...,Перспективы развития сотрудничества России и С...,Prospects for the development of cooperation b...,95940,1973.3,Q2,True,1973,3


In [12]:
# Fetch the contracts that follow the newest one already in df_paper and append
# them. The cell carries its own helper so it also works when df_paper was
# loaded from a saved file instead of being retrieved in this session.
# It can be run as many times as needed: appended rows get Contract_number and
# Author_pos too, so the next run resumes after the newest contract.

# Text that precedes the price in the seventh paragraph of a contract page.
PRICE_PREFIX = 'Общая стоимость услуг, выполняемых Исполнителем в рамках настоящего Договора, составляет '
# The scan moves on to the next contract when a page has exactly this many <p>
# tags. NOTE(review): presumably such a page is an unfilled contract template
# for an author position that does not exist - confirm against the site.
EMPTY_PAGE_PARAGRAPHS = 7


def parse_contract_paragraphs(paragraphs, url, label):
    """Build one data-frame row from the <p> tags of a contract page.

    Parameters
    ----------
    paragraphs : list
        The page's <p> tags in document order. The second one holds the
        indexing information and the title(s) inside « »; the seventh one
        holds the price.
    url : str
        Link to the contract page, stored in the row as-is.
    label : str
        Contract label shown in the page heading (e.g. "12.3").

    Returns
    -------
    dict or None
        The row, or None when the page does not have the expected layout
        (too few paragraphs, no « in the second paragraph, or no price
        phrase in the seventh), so a caller can skip it instead of recording
        values left over from another page.
    """
    if len(paragraphs) < EMPTY_PAGE_PARAGRAPHS:
        return None
    header_text = paragraphs[1].get_text()
    # get_text() already drops the <font> markup, so the price paragraph only
    # needs to be split on the fixed phrase in front of the amount.
    price_parts = paragraphs[6].get_text().split(PRICE_PREFIX)
    title_parts = header_text.split('«')
    if len(title_parts) < 2 or len(price_parts) < 2:
        return None

    base_title = title_parts[1].split('»')[0]
    price = price_parts[1].split(' (')[0]

    # The Scopus label is the text inside the second pair of parentheses
    # before the opening «. When that is missing, keep the whole paragraph so
    # the information is not lost.
    try:
        scopus = title_parts[0].split('(')[2].split(')')[0]
    except IndexError:
        scopus = header_text

    # Two titles (the second one suspected to be English) are separated by a
    # newline inside the « ».
    if '\n' in base_title:
        title_lines = base_title.split('\n')
        first_title = title_lines[0]
        second_title = title_lines[1]
    else:
        first_title = base_title
        second_title = 'no suspected english title'

    return {
        'contract_link': url,
        'Base_title': base_title,
        'First_title': first_title,
        'Suspected_second_title': second_title,
        'Price_Ruble': price,
        'contract_number': label,
        'Scopus': scopus,
        'Web_of_science': 'Web of Science' in header_text,
    }


# Resume after the highest contract number seen so far. Rows without a number
# (NaN) are ignored instead of breaking the int() conversion.
end = int(pd.to_numeric(df_paper['Contract_number'], errors='coerce').max())

# Rows are collected in a list and concatenated once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call.
new_rows = []
for contract_number in range(end+1, end+100):
    # Set the number of author positions to pull
    for author_position in range(1, 8):
        # The starting link is http://123mi.ru/1/contract.php?r=1&n=1&m=1
        # A backup for the web archive is: https://web.archive.org/web/20201031161224/http://123mi.ru/1/contract.php?r=1&n=1000&m=1
        URL = 'http://123mi.ru/1/contract.php?r=1&n={}&m={}'.format(contract_number, author_position)
        # The context manager closes the connection once the page is parsed.
        with urllib.request.urlopen(URL) as response:
            htmlParse = BeautifulSoup(response, 'html.parser')

        # The heading shows the contract label; it is printed as progress
        # output. In the retrieved data it equals "<n>.<m>" from the link, so
        # that value is the fallback when a page has no heading.
        coun = '{}.{}'.format(contract_number, author_position)
        for heading in htmlParse.find_all("h1"):
            for font in heading.find_all("font"):
                coun = font.get_text()
                print(coun)

        paragraphs = htmlParse.find_all("p")
        if len(paragraphs) == EMPTY_PAGE_PARAGRAPHS:
            # No contract for this author position: go to the next contract.
            break

        row = parse_contract_paragraphs(paragraphs, URL, coun)
        if row is None:
            print('Skipping {}: unexpected page layout'.format(URL))
            continue

        # Same split as for the existing rows: text before the first dot is
        # the contract number, text after it is the author position.
        label_parts = coun.split('.', 1)
        row['Contract_number'] = label_parts[0]
        row['Author_pos'] = label_parts[1] if len(label_parts) > 1 else None
        new_rows.append(row)

if new_rows:
    # ignore_index gives the appended rows their own row labels instead of a
    # repeated 0.
    df_paper = pd.concat([df_paper, pd.DataFrame(new_rows)], ignore_index=True)
# This can also be removed to not show the dataframe at the end.
df_paper


1973.1
1973.2
1973.3
1973.4
1973.5
1974.1
1975.1
1976.1
1977.1
1978.1
1979.1
1980.1
1981.1
1982.1
1983.1
1984.1
1985.1
1986.1
1987.1
1988.1
1989.1
1990.1
1991.1
1992.1
1993.1
1994.1
1995.1
1996.1
1997.1
1998.1
1999.1
2000.1
2001.1
2002.1
2003.1
2004.1
2005.1
2006.1
2007.1
2008.1
2009.1
2010.1
2011.1
2012.1
2013.1
2014.1
2015.1
2016.1
2017.1
2018.1
2019.1
2020.1
2021.1
2022.1
2023.1
2024.1

KeyboardInterrupt: 

In [13]:
# Clean the retrieved contracts and derive the working data frames:
#   df_unconfirmed - rows whose first title is too short to be a real title
#   df_titles      - one row per distinct paper (contract number + title)
#   df_authors     - one row per author position, with the price in USD
#   df_paper / df_paper_original - the cleaned rows after an index reset

# Titles this short or shorter are treated as not successfully retrieved.
MAX_UNCONFIRMED_TITLE_LENGTH = 20
# Ruble to US dollar rate, calculated on 8/5/2021.
RUB_TO_USD = .014
# Everything that is neither a word character nor whitespace. Raw string so
# that \w and \s are regex classes and not invalid string escapes.
PUNCTUATION_PATTERN = r'[^\w\s]'

# Backup save of data before cleaning
df_paper.to_excel('raw_df.xlsx')

# Remove the punctuation from the title columns
for title_column in ('Base_title', 'First_title', 'Suspected_second_title'):
    df_paper[title_column] = df_paper[title_column].str.replace(PUNCTUATION_PATTERN, '', regex=True)

# Drop rows with an empty base title. The copy makes the column assignments
# below act on an independent frame (no SettingWithCopyWarning).
df_paper = df_paper[df_paper['Base_title'].astype(bool)].copy()

# Identify and remove contracts not successfully retrieved.
# A missing title counts as length 0 instead of breaking the int conversion.
df_paper['length'] = df_paper['First_title'].str.len().fillna(0).astype(int)
# Rows of exactly the threshold length are not kept in df_paper, so they are
# listed here rather than disappearing from both frames.
df_unconfirmed = df_paper[df_paper['length'] <= MAX_UNCONFIRMED_TITLE_LENGTH]
df_paper = df_paper[df_paper['length'] > MAX_UNCONFIRMED_TITLE_LENGTH]

# Empty contracts have a price of 0; remove them. The comparison is numeric so
# it works whether the price is stored as text ('0') or as a number (0).
df_paper = df_paper[pd.to_numeric(df_paper['Price_Ruble'], errors='coerce') != 0].copy()

# Construct data frames
test = df_paper
df_titles = test.drop_duplicates(subset=['Base_title'])
df_titles = df_titles[['Contract_number', 'Base_title']]

# df_authors is the same object as df_paper at this point, so the numeric
# price and the USD column below are added to df_paper as well.
df_authors = df_paper
df_authors['Price_Ruble'] = df_authors['Price_Ruble'].astype('float')
df_authors['USD'] = df_authors['Price_Ruble'] * RUB_TO_USD

# Backup index reset: the old row labels are kept in a new 'index' column.
# reset_index returns a new frame, so df_authors keeps the old labels.
df_paper = df_paper.reset_index()
df_paper_original = df_paper

# Website Direct Scrape

This section scrapes the website to find all the papers that are still being sold.

In [None]:
# Download the listing page and parse it into htmlParse.
URL = 'http://123mi.ru/1/?id=2'
# The context manager closes the connection once the page has been read.
with urllib.request.urlopen(URL) as html:
    # Parse the html file
    htmlParse = BeautifulSoup(html, 'html.parser')

In [None]:
# Collect the contract numbers shown on the listing page: every <strong> tag
# whose text contains '#', with the '#' characters stripped from both ends.
# lis holds the numbers as a list, df_id holds them as a one-column data frame.
lis = [
    label.strip('#')
    for label in (tag.get_text() for tag in htmlParse.find_all("strong"))
    if '#' in label
]
# Built in one step: DataFrame.append was removed in pandas 2.0.
df_id = pd.DataFrame({'Contract_number': lis})

# Backup of contracts

This creates a backup of all the contracts and saves them in the directory the code is run from. I recommend creating a dedicated folder for them, because it writes one file per contract and author position (on the order of 10,000 files).

In [None]:
# Save a local copy of every contract page as
# Contract<contract>Position<position>.html in the current directory.
import requests

# Highest author position to back up. The retrieval loops scan positions 1-7
# and the retrieved data contains positions 6 and 7, so they are backed up too.
MAX_AUTHOR_POSITION = 7

for contract_number in range(1, 2000):
    # Progress output
    print(contract_number)
    # Set the number of author positions to pull
    for author_position in range(1, MAX_AUTHOR_POSITION + 1):
        # The starting link is http://123mi.ru/1/contract.php?r=1&n=1&m=1
        # A backup for the web archive is: https://web.archive.org/web/20201031161224/http://123mi.ru/1/contract.php?r=1&n=1000&m=1
        url = 'http://123mi.ru/1/contract.php?r=1&n={}&m={}'.format(contract_number, author_position)
        r = requests.get(url, allow_redirects=True)
        name = 'Contract{}Position{}.html'.format(contract_number, author_position)
        # The context manager flushes and closes each file right away instead
        # of leaving thousands of handles open.
        with open(name, 'wb') as backup_file:
            backup_file.write(r.content)